diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/icu_properties/src | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/icu_properties/src')
-rw-r--r-- | third_party/rust/icu_properties/src/bidi.rs | 139 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/bidi_data.rs | 216 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/error.rs | 40 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/exemplar_chars.rs | 247 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/lib.rs | 115 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/maps.rs | 602 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/props.rs | 2365 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/provider.rs | 900 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/provider/bidi_data.rs | 289 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/provider/names.rs | 277 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/runtime.rs | 360 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/script.rs | 648 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/sets.rs | 2381 | ||||
-rw-r--r-- | third_party/rust/icu_properties/src/trievalue.rs | 248 |
14 files changed, 8827 insertions, 0 deletions
diff --git a/third_party/rust/icu_properties/src/bidi.rs b/third_party/rust/icu_properties/src/bidi.rs new file mode 100644 index 0000000000..ecbd6e74ed --- /dev/null +++ b/third_party/rust/icu_properties/src/bidi.rs @@ -0,0 +1,139 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! This module exposes tooling for running the [unicode bidi algorithm](https://unicode.org/reports/tr9/) using ICU4X data. +//! +//! `BidiClassAdapter` enables ICU4X to provide data to [`unicode-bidi`], an external crate implementing UAX #9. +//! +//! ✨ *Enabled with the `bidi` Cargo feature.* +//! +//! # Examples +//! +//!``` +//! use icu_properties::bidi::BidiClassAdapter; +//! use icu_properties::maps; +//! use unicode_bidi::BidiInfo; +//! // This example text is defined using `concat!` because some browsers +//! // and text editors have trouble displaying bidi strings. +//! let text = concat!["א", // RTL#1 +//! "ב", // RTL#2 +//! "ג", // RTL#3 +//! "a", // LTR#1 +//! "b", // LTR#2 +//! "c", // LTR#3 +//! ]; // +//! +//! +//! let adapter = BidiClassAdapter::new(maps::bidi_class()); +//! // Resolve embedding levels within the text. Pass `None` to detect the +//! // paragraph level automatically. +//! +//! let bidi_info = BidiInfo::new_with_data_source(&adapter, text, None); +//! +//! // This paragraph has embedding level 1 because its first strong character is RTL. +//! assert_eq!(bidi_info.paragraphs.len(), 1); +//! let para = &bidi_info.paragraphs[0]; +//! assert_eq!(para.level.number(), 1); +//! assert!(para.level.is_rtl()); +//! +//! // Re-ordering is done after wrapping each paragraph into a sequence of +//! // lines. For this example, I'll just use a single line that spans the +//! // entire paragraph. +//! let line = para.range.clone(); +//! +//! let display = bidi_info.reorder_line(para, line); +//! 
assert_eq!(display, concat!["a", // LTR#1 +//! "b", // LTR#2 +//! "c", // LTR#3 +//! "ג", // RTL#3 +//! "ב", // RTL#2 +//! "א", // RTL#1 +//! ]); +//! ``` + +use crate::maps::CodePointMapDataBorrowed; +use crate::props::BidiClass; +use unicode_bidi::data_source::BidiDataSource; +use unicode_bidi::BidiClass as DataSourceBidiClass; + +/// An adapter to convert from icu4x `BidiClass` to `unicode_bidi::BidiClass`. +/// +/// ✨ *Enabled with the `bidi` Cargo feature.* +/// +/// # Example +/// +/// ``` +/// use icu_collections::codepointtrie::CodePointTrie; +/// use icu_properties::bidi::BidiClassAdapter; +/// use icu_properties::{maps, BidiClass}; +/// use unicode_bidi::BidiClass as DataSourceBidiClass; +/// use unicode_bidi::BidiDataSource; +/// +/// let adapter = BidiClassAdapter::new(maps::bidi_class()); +/// assert_eq!(adapter.bidi_class('a'), DataSourceBidiClass::L); +/// assert_eq!(adapter.bidi_class('ع'), DataSourceBidiClass::AL); +/// ``` +#[derive(Debug)] +pub struct BidiClassAdapter<'a> { + data: CodePointMapDataBorrowed<'a, BidiClass>, +} + +impl<'a> BidiClassAdapter<'a> { + /// Creates new instance of `BidiClassAdapter`. + pub fn new(data: CodePointMapDataBorrowed<'a, BidiClass>) -> BidiClassAdapter<'a> { + BidiClassAdapter { data } + } +} + +impl<'a> BidiDataSource for BidiClassAdapter<'a> { + /// Returns a [`DataSourceBidiClass`] given a unicode character. 
+ /// + /// # Example + /// + /// ``` + /// use icu_collections::codepointtrie::CodePointTrie; + /// use icu_properties::bidi::BidiClassAdapter; + /// use icu_properties::{maps, BidiClass}; + /// use unicode_bidi::BidiClass as DataSourceBidiClass; + /// use unicode_bidi::BidiDataSource; + /// + /// let adapter = BidiClassAdapter::new(maps::bidi_class()); + /// assert_eq!(adapter.bidi_class('a'), DataSourceBidiClass::L); + /// ``` + /// + /// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie + fn bidi_class(&self, c: char) -> DataSourceBidiClass { + let bidi_class = self.data.get(c); + match bidi_class { + BidiClass::LeftToRight => DataSourceBidiClass::L, + BidiClass::RightToLeft => DataSourceBidiClass::R, + BidiClass::EuropeanNumber => DataSourceBidiClass::EN, + BidiClass::EuropeanSeparator => DataSourceBidiClass::ES, + BidiClass::EuropeanTerminator => DataSourceBidiClass::ET, + BidiClass::ArabicNumber => DataSourceBidiClass::AN, + BidiClass::CommonSeparator => DataSourceBidiClass::CS, + BidiClass::ParagraphSeparator => DataSourceBidiClass::B, + BidiClass::SegmentSeparator => DataSourceBidiClass::S, + BidiClass::WhiteSpace => DataSourceBidiClass::WS, + BidiClass::OtherNeutral => DataSourceBidiClass::ON, + BidiClass::LeftToRightEmbedding => DataSourceBidiClass::LRE, + BidiClass::LeftToRightOverride => DataSourceBidiClass::LRO, + BidiClass::ArabicLetter => DataSourceBidiClass::AL, + BidiClass::RightToLeftEmbedding => DataSourceBidiClass::RLE, + BidiClass::RightToLeftOverride => DataSourceBidiClass::RLO, + BidiClass::PopDirectionalFormat => DataSourceBidiClass::PDF, + BidiClass::NonspacingMark => DataSourceBidiClass::NSM, + BidiClass::BoundaryNeutral => DataSourceBidiClass::BN, + BidiClass::FirstStrongIsolate => DataSourceBidiClass::FSI, + BidiClass::LeftToRightIsolate => DataSourceBidiClass::LRI, + BidiClass::RightToLeftIsolate => DataSourceBidiClass::RLI, + BidiClass::PopDirectionalIsolate => DataSourceBidiClass::PDI, + _ => + // This must not happen. 
+ { + DataSourceBidiClass::ON + } + } + } +} diff --git a/third_party/rust/icu_properties/src/bidi_data.rs b/third_party/rust/icu_properties/src/bidi_data.rs new file mode 100644 index 0000000000..2356cda023 --- /dev/null +++ b/third_party/rust/icu_properties/src/bidi_data.rs @@ -0,0 +1,216 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Data and APIs for supporting specific Bidi properties data in an efficient structure. +//! +//! Supported properties are: +//! - `Bidi_Paired_Bracket` +//! - `Bidi_Paired_Bracket_Type` +//! - `Bidi_Mirrored` +//! - `Bidi_Mirroring_Glyph` + +use crate::provider::bidi_data::{ + BidiAuxiliaryPropertiesV1, BidiAuxiliaryPropertiesV1Marker, CheckedBidiPairedBracketType, +}; +use crate::PropertiesError; + +use icu_provider::prelude::*; + +/// A wrapper around certain Bidi properties data. Can be obtained via [`bidi_auxiliary_properties()`] and +/// related getters. +/// +/// Most useful methods are on [`BidiAuxiliaryPropertiesBorrowed`] obtained by calling [`BidiAuxiliaryProperties::as_borrowed()`] +#[derive(Debug)] +pub struct BidiAuxiliaryProperties { + data: DataPayload<BidiAuxiliaryPropertiesV1Marker>, +} + +impl BidiAuxiliaryProperties { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call by consolidating it + /// up front. 
+ #[inline] + pub fn as_borrowed(&self) -> BidiAuxiliaryPropertiesBorrowed<'_> { + BidiAuxiliaryPropertiesBorrowed { + data: self.data.get(), + } + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use getters like [`bidi_auxiliary_properties()`] instead + pub fn from_data(data: DataPayload<BidiAuxiliaryPropertiesV1Marker>) -> Self { + Self { data } + } +} + +/// This struct represents the properties Bidi_Mirrored and Bidi_Mirroring_Glyph. +/// If Bidi_Mirroring_Glyph is not defined for a code point, then the value in the +/// struct is `None`. +#[derive(Debug, Eq, PartialEq)] +#[non_exhaustive] +pub struct BidiMirroringProperties { + /// Represents the Bidi_Mirroring_Glyph property value + pub mirroring_glyph: Option<char>, + /// Represents the Bidi_Mirrored property value + pub mirrored: bool, +} + +/// The enum represents Bidi_Paired_Bracket_Type, the char represents Bidi_Paired_Bracket. +/// Bidi_Paired_Bracket has a value of `None` when Bidi_Paired_Bracket_Type is `None`. +#[derive(Debug, Eq, PartialEq)] +#[non_exhaustive] +pub enum BidiPairingProperties { + /// Represents Bidi_Paired_Bracket_Type=Open, and the Bidi_Paired_Bracket value for that code point. + Open(char), + /// Represents Bidi_Paired_Bracket_Type=Close, and the Bidi_Paired_Bracket value for that code point. + Close(char), + /// Represents Bidi_Paired_Bracket_Type=None, which cooccurs with Bidi_Paired_Bracket + /// being undefined for that code point. + None, +} + +/// A borrowed wrapper around Bidi properties data, returned by +/// [`BidiAuxiliaryProperties::as_borrowed()`]. More efficient to query. +#[derive(Debug)] +pub struct BidiAuxiliaryPropertiesBorrowed<'a> { + data: &'a BidiAuxiliaryPropertiesV1<'a>, +} + +impl<'a> BidiAuxiliaryPropertiesBorrowed<'a> { + // The source data coming from icuexportdata will use 0 to represent the + // property value in cases for which the Bidi_Mirroring_Glyph property value + // of a code point is undefined. 
Since Rust types can be more expressive, we + // should represent these cases as None. + fn convert_mirroring_glyph_data(trie_data_char: char) -> Option<char> { + if trie_data_char as u32 == 0 { + None + } else { + Some(trie_data_char) + } + } + + /// Return a struct for the given code point representing Bidi mirroring-related + /// property values. See [`BidiMirroringProperties`]. + /// + /// # Examples + /// ``` + /// use icu_properties::{bidi_data, bidi_data::BidiMirroringProperties}; + /// + /// let bidi_data = bidi_data::bidi_auxiliary_properties(); + /// + /// let open_paren = bidi_data.get32_mirroring_props('(' as u32); + /// assert_eq!(open_paren.mirroring_glyph, Some(')')); + /// assert_eq!(open_paren.mirrored, true); + /// let close_paren = bidi_data.get32_mirroring_props(')' as u32); + /// assert_eq!(close_paren.mirroring_glyph, Some('(')); + /// assert_eq!(close_paren.mirrored, true); + /// let open_angle_bracket = bidi_data.get32_mirroring_props('<' as u32); + /// assert_eq!(open_angle_bracket.mirroring_glyph, Some('>')); + /// assert_eq!(open_angle_bracket.mirrored, true); + /// let close_angle_bracket = bidi_data.get32_mirroring_props('>' as u32); + /// assert_eq!(close_angle_bracket.mirroring_glyph, Some('<')); + /// assert_eq!(close_angle_bracket.mirrored, true); + /// let three = bidi_data.get32_mirroring_props('3' as u32); + /// assert_eq!(three.mirroring_glyph, None); + /// assert_eq!(three.mirrored, false); + /// ``` + pub fn get32_mirroring_props(&self, code_point: u32) -> BidiMirroringProperties { + let bidi_aux_props = self.data.trie.get32(code_point); + let mirroring_glyph_opt = + Self::convert_mirroring_glyph_data(bidi_aux_props.mirroring_glyph); + BidiMirroringProperties { + mirroring_glyph: mirroring_glyph_opt, + mirrored: bidi_aux_props.mirrored, + } + } + + /// Return a struct for the given code point representing Bidi bracket + /// pairing-related property values. 
See [`BidiPairingProperties`] + /// + /// # Examples + /// ``` + /// use icu_properties::{bidi_data, bidi_data::BidiPairingProperties}; + /// + /// let bidi_data = bidi_data::bidi_auxiliary_properties(); + /// + /// let open_paren = bidi_data.get32_pairing_props('(' as u32); + /// assert_eq!(open_paren, BidiPairingProperties::Open(')')); + /// let close_paren = bidi_data.get32_pairing_props(')' as u32); + /// assert_eq!(close_paren, BidiPairingProperties::Close('(')); + /// let open_angle_bracket = bidi_data.get32_pairing_props('<' as u32); + /// assert_eq!(open_angle_bracket, BidiPairingProperties::None); + /// let close_angle_bracket = bidi_data.get32_pairing_props('>' as u32); + /// assert_eq!(close_angle_bracket, BidiPairingProperties::None); + /// let three = bidi_data.get32_pairing_props('3' as u32); + /// assert_eq!(three, BidiPairingProperties::None); + /// ``` + pub fn get32_pairing_props(&self, code_point: u32) -> BidiPairingProperties { + let bidi_aux_props = self.data.trie.get32(code_point); + let mirroring_glyph = bidi_aux_props.mirroring_glyph; + let paired_bracket_type = bidi_aux_props.paired_bracket_type; + match paired_bracket_type { + CheckedBidiPairedBracketType::Open => BidiPairingProperties::Open(mirroring_glyph), + CheckedBidiPairedBracketType::Close => BidiPairingProperties::Close(mirroring_glyph), + _ => BidiPairingProperties::None, + } + } +} + +impl BidiAuxiliaryPropertiesBorrowed<'static> { + /// Cheaply converts a `BidiAuxiliaryPropertiesBorrowed<'static>` into a `BidiAuxiliaryProperties`. + pub const fn static_to_owned(self) -> BidiAuxiliaryProperties { + BidiAuxiliaryProperties { + data: DataPayload::from_static_ref(self.data), + } + } +} + +/// Creates a [`BidiAuxiliaryPropertiesV1`] struct that represents the data for certain +/// Bidi properties. 
+/// +/// ✨ *Enabled with the `compiled_data` Cargo feature.* +/// +/// [📚 Help choosing a constructor](icu_provider::constructors) +/// +/// # Examples +/// ``` +/// use icu_properties::{bidi_data, bidi_data::BidiMirroringProperties}; +/// +/// let bidi_data = bidi_data::bidi_auxiliary_properties(); +/// +/// let open_paren = bidi_data.get32_mirroring_props('(' as u32); +/// assert_eq!(open_paren.mirroring_glyph, Some(')')); +/// assert_eq!(open_paren.mirrored, true); +/// ``` +#[cfg(feature = "compiled_data")] +pub const fn bidi_auxiliary_properties() -> BidiAuxiliaryPropertiesBorrowed<'static> { + BidiAuxiliaryPropertiesBorrowed { + data: crate::provider::Baked::SINGLETON_PROPS_BIDIAUXILIARYPROPS_V1, + } +} + +icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + result: Result<BidiAuxiliaryProperties, PropertiesError>, + #[cfg(skip)] + functions: [ + bidi_auxiliary_properties, + load_bidi_auxiliary_properties_with_any_provider, + load_bidi_auxiliary_properties_with_buffer_provider, + load_bidi_auxiliary_properties_unstable, + ] +); + +#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, bidi_auxiliary_properties)] +pub fn load_bidi_auxiliary_properties_unstable( + provider: &(impl DataProvider<BidiAuxiliaryPropertiesV1Marker> + ?Sized), +) -> Result<BidiAuxiliaryProperties, PropertiesError> { + Ok(provider + .load(Default::default()) + .and_then(DataResponse::take_payload) + .map(BidiAuxiliaryProperties::from_data)?) +} diff --git a/third_party/rust/icu_properties/src/error.rs b/third_party/rust/icu_properties/src/error.rs new file mode 100644 index 0000000000..1526e75790 --- /dev/null +++ b/third_party/rust/icu_properties/src/error.rs @@ -0,0 +1,40 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use displaydoc::Display; +use icu_provider::DataError; + +#[cfg(doc)] +use crate::GeneralCategoryGroup; +#[cfg(doc)] +use crate::Script; + +#[cfg(feature = "std")] +impl std::error::Error for PropertiesError {} + +/// A list of error outcomes for various operations in this module. +/// +/// Re-exported as [`Error`](crate::Error). +#[derive(Display, Debug, Copy, Clone)] +#[non_exhaustive] +pub enum PropertiesError { + /// An error occurred while loading data + #[displaydoc("{0}")] + PropDataLoad(DataError), + /// An unknown value was used for the [`Script`](crate::Script) property + #[displaydoc("Unknown script id: {0}")] + UnknownScriptId(u16), + /// An unknown value was used for the [`GeneralCategoryGroup`](crate::GeneralCategoryGroup) property + #[displaydoc("Unknown general category group: {0}")] + UnknownGeneralCategoryGroup(u32), + /// An unknown or unexpected property name was used for an API dealing with properties specified as strings at runtime + #[displaydoc("Unexpected or unknown property name")] + UnexpectedPropertyName, +} + +impl From<DataError> for PropertiesError { + fn from(e: DataError) -> Self { + PropertiesError::PropDataLoad(e) + } +} diff --git a/third_party/rust/icu_properties/src/exemplar_chars.rs b/third_party/rust/icu_properties/src/exemplar_chars.rs new file mode 100644 index 0000000000..2dd7b343bf --- /dev/null +++ b/third_party/rust/icu_properties/src/exemplar_chars.rs @@ -0,0 +1,247 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! This module provides APIs for getting exemplar characters for a locale. +//! +//! Exemplars are characters used by a language, separated into different sets. +//! The sets are: main, auxiliary, punctuation, numbers, and index. +//! +//! The sets define, according to typical usage in the language, +//! 
which characters occur in which contexts with which frequency. +//! For more information, see the documentation in the +//! [Exemplars section in Unicode Technical Standard #35](https://unicode.org/reports/tr35/tr35-general.html#Exemplars) +//! of the LDML specification. +//! +//! # Examples +//! +//! ``` +//! use icu::locid::locale; +//! use icu::properties::exemplar_chars; +//! +//! let locale = locale!("en-001").into(); +//! let data = exemplar_chars::exemplars_main(&locale) +//! .expect("locale should be present"); +//! let exemplars_main = data.as_borrowed(); +//! +//! assert!(exemplars_main.contains_char('a')); +//! assert!(exemplars_main.contains_char('z')); +//! assert!(exemplars_main.contains("a")); +//! assert!(!exemplars_main.contains("ä")); +//! assert!(!exemplars_main.contains("ng")); +//! ``` + +use crate::provider::*; +use crate::sets::UnicodeSetData; +use crate::PropertiesError; +use icu_provider::prelude::*; + +macro_rules! make_exemplar_chars_unicode_set_property { + ( + // currently unused + marker: $marker_name:ident; + keyed_data_marker: $keyed_data_marker:ty; + func: + $vis:vis fn $funcname:ident(); + $(#[$attr:meta])* + $vis2:vis fn $constname:ident(); + ) => { + #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")] + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + $vis fn $funcname( + provider: &(impl DataProvider<$keyed_data_marker> + ?Sized), + locale: &DataLocale, + ) -> Result<UnicodeSetData, PropertiesError> { + Ok(provider.load( + DataRequest { + locale, + metadata: Default::default(), + }) + .and_then(DataResponse::take_payload) + .map(UnicodeSetData::from_data)? 
+ ) + } + $(#[$attr])* + #[cfg(feature = "compiled_data")] + $vis2 fn $constname( + locale: &DataLocale, + ) -> Result<UnicodeSetData, PropertiesError> { + Ok(UnicodeSetData::from_data( + DataProvider::<$keyed_data_marker>::load( + &crate::provider::Baked, + DataRequest { + locale, + metadata: Default::default(), + }) + .and_then(DataResponse::take_payload)? + )) + } + } +} + +make_exemplar_chars_unicode_set_property!( + marker: ExemplarCharactersMain; + keyed_data_marker: ExemplarCharactersMainV1Marker; + func: + pub fn load_exemplars_main(); + + /// Get the "main" set of exemplar characters. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use icu::properties::exemplar_chars; + /// + /// let data = exemplar_chars::exemplars_main(&locale!("en").into()) + /// .expect("locale should be present"); + /// let exemplars_main = data.as_borrowed(); + /// + /// assert!(exemplars_main.contains_char('a')); + /// assert!(exemplars_main.contains_char('z')); + /// assert!(exemplars_main.contains("a")); + /// assert!(!exemplars_main.contains("ä")); + /// assert!(!exemplars_main.contains("ng")); + /// assert!(!exemplars_main.contains("A")); + /// ``` + pub fn exemplars_main(); +); + +make_exemplar_chars_unicode_set_property!( + marker: ExemplarCharactersAuxiliary; + keyed_data_marker: ExemplarCharactersAuxiliaryV1Marker; + func: + pub fn load_exemplars_auxiliary(); + + /// Get the "auxiliary" set of exemplar characters. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use icu::properties::exemplar_chars; + /// + /// let data = + /// exemplar_chars::exemplars_auxiliary(&locale!("en").into()) + /// .expect("locale should be present"); + /// let exemplars_auxiliary = data.as_borrowed(); + /// + /// assert!(!exemplars_auxiliary.contains_char('a')); + /// assert!(!exemplars_auxiliary.contains_char('z')); + /// assert!(!exemplars_auxiliary.contains("a")); + /// assert!(exemplars_auxiliary.contains("ä")); + /// assert!(!exemplars_auxiliary.contains("ng")); + /// assert!(!exemplars_auxiliary.contains("A")); + /// ``` + pub fn exemplars_auxiliary(); +); + +make_exemplar_chars_unicode_set_property!( + marker: ExemplarCharactersPunctuation; + keyed_data_marker: ExemplarCharactersPunctuationV1Marker; + func: + pub fn load_exemplars_punctuation(); + + /// Get the "punctuation" set of exemplar characters. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use icu::properties::exemplar_chars; + /// + /// let data = + /// exemplar_chars::exemplars_punctuation(&locale!("en").into()) + /// .expect("locale should be present"); + /// let exemplars_punctuation = data.as_borrowed(); + /// + /// assert!(!exemplars_punctuation.contains_char('0')); + /// assert!(!exemplars_punctuation.contains_char('9')); + /// assert!(!exemplars_punctuation.contains_char('%')); + /// assert!(exemplars_punctuation.contains_char(',')); + /// assert!(exemplars_punctuation.contains_char('.')); + /// assert!(exemplars_punctuation.contains_char('!')); + /// assert!(exemplars_punctuation.contains_char('?')); + /// ``` + pub fn exemplars_punctuation(); +); + +make_exemplar_chars_unicode_set_property!( + marker: ExemplarCharactersNumbers; + keyed_data_marker: ExemplarCharactersNumbersV1Marker; + func: + pub fn load_exemplars_numbers(); + + /// Get the "numbers" set of exemplar characters. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use icu::properties::exemplar_chars; + /// + /// let data = + /// exemplar_chars::exemplars_numbers(&locale!("en").into()) + /// .expect("locale should be present"); + /// let exemplars_numbers = data.as_borrowed(); + /// + /// assert!(exemplars_numbers.contains_char('0')); + /// assert!(exemplars_numbers.contains_char('9')); + /// assert!(exemplars_numbers.contains_char('%')); + /// assert!(exemplars_numbers.contains_char(',')); + /// assert!(exemplars_numbers.contains_char('.')); + /// assert!(!exemplars_numbers.contains_char('!')); + /// assert!(!exemplars_numbers.contains_char('?')); + /// ``` + pub fn exemplars_numbers(); +); + +make_exemplar_chars_unicode_set_property!( + marker: ExemplarCharactersIndex; + keyed_data_marker: ExemplarCharactersIndexV1Marker; + func: + pub fn load_exemplars_index(); + + /// Get the "index" set of exemplar characters. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use icu::properties::exemplar_chars; + /// + /// let data = + /// exemplar_chars::exemplars_index(&locale!("en").into()) + /// .expect("locale should be present"); + /// let exemplars_index = data.as_borrowed(); + /// + /// assert!(!exemplars_index.contains_char('a')); + /// assert!(!exemplars_index.contains_char('z')); + /// assert!(!exemplars_index.contains("a")); + /// assert!(!exemplars_index.contains("ä")); + /// assert!(!exemplars_index.contains("ng")); + /// assert!(exemplars_index.contains("A")); + /// ``` + pub fn exemplars_index(); +); diff --git a/third_party/rust/icu_properties/src/lib.rs b/third_party/rust/icu_properties/src/lib.rs new file mode 100644 index 0000000000..3b9a236e23 --- /dev/null +++ b/third_party/rust/icu_properties/src/lib.rs @@ -0,0 +1,115 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Definitions of [Unicode Properties] and APIs for +//! retrieving property data in an appropriate data structure. +//! +//! This module is published as its own crate ([`icu_properties`](https://docs.rs/icu_properties/latest/icu_properties/)) +//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. +//! +//! APIs that return a [`CodePointSetData`] exist for binary properties and certain enumerated +//! properties. See the [`sets`] module for more details. +//! +//! APIs that return a [`CodePointMapData`] exist for certain enumerated properties. See the +//! [`maps`] module for more details. +//! +//! # Examples +//! +//! ## Property data as `CodePointSetData`s +//! +//! ``` +//! 
use icu::properties::{maps, sets, GeneralCategory}; +//! +//! // A binary property as a `CodePointSetData` +//! +//! assert!(sets::emoji().contains('🎃')); // U+1F383 JACK-O-LANTERN +//! assert!(!sets::emoji().contains('木')); // U+6728 +//! +//! // An individual enumerated property value as a `CodePointSetData` +//! +//! let line_sep_data = maps::general_category() +//! .get_set_for_value(GeneralCategory::LineSeparator); +//! let line_sep = line_sep_data.as_borrowed(); +//! +//! assert!(line_sep.contains32(0x2028)); +//! assert!(!line_sep.contains32(0x2029)); +//! ``` +//! +//! ## Property data as `CodePointMapData`s +//! +//! ``` +//! use icu::properties::{maps, Script}; +//! +//! assert_eq!(maps::script().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN +//! assert_eq!(maps::script().get('木'), Script::Han); // U+6728 +//! ``` +//! +//! [`ICU4X`]: ../icu/index.html +//! [Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html +//! [`CodePointSetData`]: crate::sets::CodePointSetData +//! [`CodePointMapData`]: crate::maps::CodePointMapData +//! [`sets`]: crate::sets + +// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations +#![cfg_attr(not(any(test, feature = "std")), no_std)] +#![cfg_attr( + not(test), + deny( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::exhaustive_structs, + clippy::exhaustive_enums, + missing_debug_implementations, + ) +)] +#![warn(missing_docs)] + +extern crate alloc; + +#[cfg(feature = "bidi")] +pub mod bidi; + +mod error; +pub mod maps; + +// NOTE: The Pernosco debugger has special knowledge +// of the `CanonicalCombiningClass` struct inside the `props` +// module. Please do not change the crate-module-qualified +// name of that struct without coordination. 
+mod props; + +pub mod bidi_data; +pub mod exemplar_chars; +pub mod provider; +pub(crate) mod runtime; +#[allow(clippy::exhaustive_structs)] // TODO +pub mod script; +pub mod sets; +mod trievalue; + +pub use props::{ + BidiClass, CanonicalCombiningClass, EastAsianWidth, GeneralCategory, GeneralCategoryGroup, + GraphemeClusterBreak, IndicSyllabicCategory, LineBreak, Script, SentenceBreak, WordBreak, +}; + +/// Module for working with the names of property values +pub mod names { + pub use crate::props::{ + PropertyEnumToValueNameLinearMapper, PropertyEnumToValueNameLinearMapperBorrowed, + }; + pub use crate::props::{ + PropertyEnumToValueNameLinearTiny4Mapper, PropertyEnumToValueNameLinearTiny4MapperBorrowed, + }; + pub use crate::props::{ + PropertyEnumToValueNameSparseMapper, PropertyEnumToValueNameSparseMapperBorrowed, + }; + pub use crate::props::{PropertyValueNameToEnumMapper, PropertyValueNameToEnumMapperBorrowed}; +} + +pub use error::PropertiesError; + +#[doc(no_inline)] +pub use PropertiesError as Error; diff --git a/third_party/rust/icu_properties/src/maps.rs b/third_party/rust/icu_properties/src/maps.rs new file mode 100644 index 0000000000..478ef5f2c1 --- /dev/null +++ b/third_party/rust/icu_properties/src/maps.rs @@ -0,0 +1,602 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! The functions in this module return a [`CodePointMapData`] representing, for +//! each code point in the entire range of code points, the property values +//! for a particular Unicode property. +//! +//! The descriptions of most properties are taken from [`TR44`], the documentation for the +//! Unicode Character Database. +//! +//! 
[`TR44`]: https://www.unicode.org/reports/tr44 + +use crate::error::PropertiesError; +use crate::provider::*; +use crate::sets::CodePointSetData; +#[cfg(doc)] +use crate::*; +use core::marker::PhantomData; +use core::ops::RangeInclusive; +use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue}; +use icu_provider::prelude::*; +use zerovec::ZeroVecError; + +/// A wrapper around code point map data. It is returned by APIs that return Unicode +/// property data in a map-like form, ex: enumerated property value data keyed +/// by code point. Access its data via the borrowed version, +/// [`CodePointMapDataBorrowed`]. +#[derive(Debug, Clone)] +pub struct CodePointMapData<T: TrieValue> { + data: DataPayload<ErasedMaplikeMarker<T>>, +} + +/// Private marker type for CodePointMapData +/// to work for all same-value map properties at once +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +struct ErasedMaplikeMarker<T>(PhantomData<T>); +impl<T: TrieValue> DataMarker for ErasedMaplikeMarker<T> { + type Yokeable = PropertyCodePointMapV1<'static, T>; +} + +impl<T: TrieValue> CodePointMapData<T> { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it + /// up front. + /// + /// This owned version is returned by functions that use a runtime data provider. + #[inline] + pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> { + CodePointMapDataBorrowed { + map: self.data.get(), + } + } + + /// Convert this map to a map around another type + /// + /// Typically useful for type-erasing maps into maps around integers. 
+ /// + /// # Panics + /// Will panic if T and P are different sizes + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, GeneralCategory}; + /// + /// let data = maps::general_category().static_to_owned(); + /// + /// let gc = data.try_into_converted::<u8>().unwrap(); + /// let gc = gc.as_borrowed(); + /// + /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728 + /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN + /// ``` + pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, ZeroVecError> + where + P: TrieValue, + { + self.data + .try_map_project::<ErasedMaplikeMarker<P>, _, _>(move |data, _| { + data.try_into_converted() + }) + .map(CodePointMapData::from_data) + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use getters like [`load_general_category()`] instead + pub fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyCodePointMapV1<'static, T>>, + { + Self { data: data.cast() } + } + + /// Construct a new one from an owned [`CodePointTrie`] + pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self { + let set = PropertyCodePointMapV1::from_code_point_trie(trie); + CodePointMapData::from_data(DataPayload::<ErasedMaplikeMarker<T>>::from_owned(set)) + } + + /// Convert this type to a [`CodePointTrie`] as a borrowed value. + /// + /// The data backing this is extensible and supports multiple implementations. + /// Currently it is always [`CodePointTrie`]; however in the future more backends may be + /// added, and users may select which at data generation time. + /// + /// This method returns an `Option` in order to return `None` when the backing data provider + /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time + /// constraint. 
+ pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> { + self.data.get().as_code_point_trie() + } + + /// Convert this type to a [`CodePointTrie`], borrowing if possible, + /// otherwise allocating a new [`CodePointTrie`]. + /// + /// The data backing this is extensible and supports multiple implementations. + /// Currently it is always [`CodePointTrie`]; however in the future more backends may be + /// added, and users may select which at data generation time. + /// + /// The performance of the conversion to this specific return type will vary + /// depending on the data structure that is backing `self`. + pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> { + self.data.get().to_code_point_trie() + } +} + +/// A borrowed wrapper around code point map data, returned by +/// [`CodePointMapData::as_borrowed()`]. More efficient to query. +#[derive(Clone, Copy, Debug)] +pub struct CodePointMapDataBorrowed<'a, T: TrieValue> { + map: &'a PropertyCodePointMapV1<'a, T>, +} + +impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> { + /// Get the value this map has associated with code point `ch` + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, GeneralCategory}; + /// + /// let gc = maps::general_category(); + /// + /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728 + /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN + /// ``` + pub fn get(self, ch: char) -> T { + self.map.get32(ch as u32) + } + + /// Get the value this map has associated with code point `ch` + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, GeneralCategory}; + /// + /// let gc = maps::general_category(); + /// + /// assert_eq!(gc.get32(0x6728), GeneralCategory::OtherLetter); // U+6728 (木) + /// assert_eq!(gc.get32(0x1F383), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN + /// ``` + pub fn get32(self, ch: u32) -> T { + self.map.get32(ch) + } + + /// Get a [`CodePointSetData`] for 
all elements corresponding to a particular value + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, GeneralCategory}; + /// + /// let gc = maps::general_category(); + /// + /// let other_letter_set_data = + /// gc.get_set_for_value(GeneralCategory::OtherLetter); + /// let other_letter_set = other_letter_set_data.as_borrowed(); + /// + /// assert!(other_letter_set.contains('木')); // U+6728 + /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN + /// ``` + pub fn get_set_for_value(self, value: T) -> CodePointSetData { + let set = self.map.get_set_for_value(value); + CodePointSetData::from_code_point_inversion_list(set) + } + + /// Yields an [`Iterator`] returning ranges of consecutive code points that + /// share the same value in the [`CodePointMapData`]. + /// + /// # Examples + /// + /// ``` + /// use core::ops::RangeInclusive; + /// use icu::properties::maps::{self, CodePointMapData}; + /// use icu::properties::GeneralCategory; + /// + /// let gc = maps::general_category(); + /// let mut ranges = gc.iter_ranges(); + /// let next = ranges.next().unwrap(); + /// assert_eq!(next.range, 0..=31); + /// assert_eq!(next.value, GeneralCategory::Control); + /// let next = ranges.next().unwrap(); + /// assert_eq!(next.range, 32..=32); + /// assert_eq!(next.value, GeneralCategory::SpaceSeparator); + /// ``` + pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a { + self.map.iter_ranges() + } + + /// Yields an [`Iterator`] returning ranges of consecutive code points that + /// share the same value `v` in the [`CodePointMapData`]. 
+ /// + /// # Examples + /// + /// + /// ``` + /// use core::ops::RangeInclusive; + /// use icu::properties::maps::{self, CodePointMapData}; + /// use icu::properties::GeneralCategory; + /// + /// let gc = maps::general_category(); + /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter); + /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32); + /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32); + /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32); + /// ``` + pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { + self.map + .iter_ranges() + .filter(move |r| r.value == val) + .map(|r| r.range) + } + + /// Yields an [`Iterator`] returning ranges of consecutive code points that + /// do *not* have the value `v` in the [`CodePointMapData`]. + pub fn iter_ranges_for_value_complemented( + self, + val: T, + ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { + self.map + .iter_ranges_mapped(move |value| value != val) + .filter(|v| v.value) + .map(|v| v.range) + } + + /// Exposed for FFI needs, could be exposed in general in the future but we should + /// have a use case first. + /// + /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()` + #[doc(hidden)] + pub fn iter_ranges_mapped<U: Eq + 'a>( + self, + predicate: impl FnMut(T) -> U + Copy + 'a, + ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a { + self.map.iter_ranges_mapped(predicate) + } +} + +impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> { + /// Cheaply converts a `CodePointMapDataBorrowed<'static>` into a `CodePointMapData`. 
+ pub const fn static_to_owned(self) -> CodePointMapData<T> { + CodePointMapData { + data: DataPayload::from_static_ref(self.map), + } + } +} + +impl<'a> CodePointMapDataBorrowed<'a, crate::GeneralCategory> { + /// Yields an [`Iterator`] returning ranges of consecutive code points that + /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`] + /// + /// # Examples + /// + /// + /// ``` + /// use core::ops::RangeInclusive; + /// use icu::properties::maps::{self, CodePointMapData}; + /// use icu::properties::GeneralCategoryGroup; + /// + /// let gc = maps::general_category(); + /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter); + /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32); + /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32); + /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32); + /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32); + /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32); + /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32); + /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32); + /// ``` + pub fn iter_ranges_for_group( + self, + group: crate::GeneralCategoryGroup, + ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { + self.map + .iter_ranges_mapped(move |value| group.contains(value)) + .filter(|v| v.value) + .map(|v| v.range) + } +} + +macro_rules! make_map_property { + ( + // currently unused + property: $prop_name:expr; + // currently unused + marker: $marker_name:ident; + value: $value_ty:path; + keyed_data_marker: $keyed_data_marker:ty; + func: + $(#[$doc:meta])* + $vis2:vis const $constname:ident => $singleton:ident; + $vis:vis fn $name:ident(); + ) => { + #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")] + /// + /// Note that this will return an owned version of the data. 
Functionality is available on + /// the borrowed version, accessible through [`CodePointMapData::as_borrowed`]. + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + $vis fn $name( + provider: &(impl DataProvider<$keyed_data_marker> + ?Sized) + ) -> Result<CodePointMapData<$value_ty>, PropertiesError> { + Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map(CodePointMapData::from_data)?) + } + $(#[$doc])* + #[cfg(feature = "compiled_data")] + pub const fn $constname() -> CodePointMapDataBorrowed<'static, $value_ty> { + CodePointMapDataBorrowed { + map: crate::provider::Baked::$singleton + } + } + }; +} + +make_map_property! { + property: "General_Category"; + marker: GeneralCategoryProperty; + value: crate::GeneralCategory; + keyed_data_marker: GeneralCategoryV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the General_Category Unicode enumerated property. See [`GeneralCategory`]. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, GeneralCategory}; + /// + /// assert_eq!(maps::general_category().get('木'), GeneralCategory::OtherLetter); // U+6728 + /// assert_eq!(maps::general_category().get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN + /// ``` + pub const general_category => SINGLETON_PROPS_GC_V1; + pub fn load_general_category(); +} + +make_map_property! { + property: "Bidi_Class"; + marker: BidiClassProperty; + value: crate::BidiClass; + keyed_data_marker: BidiClassV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the Bidi_Class Unicode enumerated property. See [`BidiClass`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, BidiClass}; + /// + /// assert_eq!(maps::bidi_class().get('y'), BidiClass::LeftToRight); // U+0079 + /// assert_eq!(maps::bidi_class().get('ع'), BidiClass::ArabicLetter); // U+0639 + /// ``` + pub const bidi_class => SINGLETON_PROPS_BC_V1; + pub fn load_bidi_class(); +} + +make_map_property! { + property: "Script"; + marker: ScriptProperty; + value: crate::Script; + keyed_data_marker: ScriptV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the Script Unicode enumerated property. See [`Script`]. + /// + /// **Note:** Some code points are associated with multiple scripts. If you are trying to + /// determine whether a code point belongs to a certain script, you should use + /// [`load_script_with_extensions_unstable`] and [`ScriptWithExtensionsBorrowed::has_script`] + /// instead of this function. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, Script}; + /// + /// assert_eq!(maps::script().get('木'), Script::Han); // U+6728 + /// assert_eq!(maps::script().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN + /// ``` + /// [`load_script_with_extensions_unstable`]: crate::script::load_script_with_extensions_unstable + /// [`ScriptWithExtensionsBorrowed::has_script`]: crate::script::ScriptWithExtensionsBorrowed::has_script + pub const script => SINGLETON_PROPS_SC_V1; + pub fn load_script(); +} + +make_map_property! { + property: "East_Asian_Width"; + marker: EastAsianWidthProperty; + value: crate::EastAsianWidth; + keyed_data_marker: EastAsianWidthV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the East_Asian_Width Unicode enumerated + /// property. See [`EastAsianWidth`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, EastAsianWidth}; + /// + /// assert_eq!(maps::east_asian_width().get('\u{FF71}'), EastAsianWidth::Halfwidth); // U+FF71: Halfwidth Katakana Letter A + /// assert_eq!(maps::east_asian_width().get('ア'), EastAsianWidth::Wide); //U+30A2: Katakana Letter A + /// ``` + pub const east_asian_width => SINGLETON_PROPS_EA_V1; + pub fn load_east_asian_width(); +} + +make_map_property! { + property: "Line_Break"; + marker: LineBreakProperty; + value: crate::LineBreak; + keyed_data_marker: LineBreakV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the Line_Break Unicode enumerated + /// property. See [`LineBreak`]. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation. + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, LineBreak}; + /// + /// assert_eq!(maps::line_break().get(')'), LineBreak::CloseParenthesis); // U+0029: Right Parenthesis + /// assert_eq!(maps::line_break().get('ぁ'), LineBreak::ConditionalJapaneseStarter); //U+3041: Hiragana Letter Small A + /// ``` + pub const line_break => SINGLETON_PROPS_LB_V1; + pub fn load_line_break(); +} + +make_map_property! { + property: "Grapheme_Cluster_Break"; + marker: GraphemeClusterBreakProperty; + value: crate::GraphemeClusterBreak; + keyed_data_marker: GraphemeClusterBreakV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the Grapheme_Cluster_Break Unicode enumerated + /// property. See [`GraphemeClusterBreak`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation. + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, GraphemeClusterBreak}; + /// + /// assert_eq!(maps::grapheme_cluster_break().get('🇦'), GraphemeClusterBreak::RegionalIndicator); // U+1F1E6: Regional Indicator Symbol Letter A + /// assert_eq!(maps::grapheme_cluster_break().get('ำ'), GraphemeClusterBreak::SpacingMark); //U+0E33: Thai Character Sara Am + /// ``` + pub const grapheme_cluster_break => SINGLETON_PROPS_GCB_V1; + pub fn load_grapheme_cluster_break(); +} + +make_map_property! { + property: "Word_Break"; + marker: WordBreakProperty; + value: crate::WordBreak; + keyed_data_marker: WordBreakV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the Word_Break Unicode enumerated + /// property. See [`WordBreak`]. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation. + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, WordBreak}; + /// + /// assert_eq!(maps::word_break().get('.'), WordBreak::MidNumLet); // U+002E: Full Stop + /// assert_eq!(maps::word_break().get(','), WordBreak::MidNum); // U+FF0C: Fullwidth Comma + /// ``` + pub const word_break => SINGLETON_PROPS_WB_V1; + pub fn load_word_break(); +} + +make_map_property! { + property: "Sentence_Break"; + marker: SentenceBreakProperty; + value: crate::SentenceBreak; + keyed_data_marker: SentenceBreakV1Marker; + func: + /// Return a [`CodePointMapDataBorrowed`] for the Sentence_Break Unicode enumerated + /// property. See [`SentenceBreak`]. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation. + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, SentenceBreak}; + /// + /// assert_eq!(maps::sentence_break().get('9'), SentenceBreak::Numeric); // U+FF19: Fullwidth Digit Nine + /// assert_eq!(maps::sentence_break().get(','), SentenceBreak::SContinue); // U+002C: Comma + /// ``` + pub const sentence_break => SINGLETON_PROPS_SB_V1; + pub fn load_sentence_break(); +} + +make_map_property! { + property: "Canonical_Combining_Class"; + marker: CanonicalCombiningClassProperty; + value: crate::CanonicalCombiningClass; + keyed_data_marker: CanonicalCombiningClassV1Marker; + func: + /// Return a [`CodePointMapData`] for the Canonical_Combining_Class Unicode property. See + /// [`CanonicalCombiningClass`]. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// **Note:** See `icu_normalizer::CanonicalCombiningClassMap` for the preferred API + /// to look up the Canonical_Combining_Class property by scalar value. + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, CanonicalCombiningClass}; + /// + /// assert_eq!(maps::canonical_combining_class().get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A + /// assert_eq!(maps::canonical_combining_class().get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT + /// ``` + pub const canonical_combining_class => SINGLETON_PROPS_CCC_V1; + pub fn load_canonical_combining_class(); +} + +make_map_property! 
{ + property: "Indic_Syllabic_Category"; + marker: IndicSyllabicCategoryProperty; + value: crate::IndicSyllabicCategory; + keyed_data_marker: IndicSyllabicCategoryV1Marker; + func: + /// Return a [`CodePointMapData`] for the Indic_Syllabic_Category Unicode property. See + /// [`IndicSyllabicCategory`]. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::{maps, IndicSyllabicCategory}; + /// + /// assert_eq!(maps::indic_syllabic_category().get('a'), IndicSyllabicCategory::Other); + /// assert_eq!(maps::indic_syllabic_category().get32(0x0900), IndicSyllabicCategory::Bindu); // U+0900: DEVANAGARI SIGN INVERTED CANDRABINDU + /// ``` + pub const indic_syllabic_category => SINGLETON_PROPS_INSC_V1; + pub fn load_indic_syllabic_category(); +} diff --git a/third_party/rust/icu_properties/src/props.rs b/third_party/rust/icu_properties/src/props.rs new file mode 100644 index 0000000000..247b505c81 --- /dev/null +++ b/third_party/rust/icu_properties/src/props.rs @@ -0,0 +1,2365 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! A collection of property definitions shared across contexts +//! (ex: representing trie values). +//! +//! This module defines enums / newtypes for enumerated properties. +//! String properties are represented as newtypes if their +//! values represent code points. 
+ +use crate::provider::{names::*, *}; +use crate::PropertiesError; +use core::marker::PhantomData; +use icu_collections::codepointtrie::TrieValue; +use icu_provider::prelude::*; +use zerovec::ule::VarULE; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +/// Private marker type for PropertyValueNameToEnumMapper +/// to work for all properties at once +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub(crate) struct ErasedNameToEnumMapV1Marker; +impl DataMarker for ErasedNameToEnumMapV1Marker { + type Yokeable = PropertyValueNameToEnumMapV1<'static>; +} + +/// A struct capable of looking up a property value from a string name. +/// Access its data by calling [`Self::as_borrowed()`] and using the methods on +/// [`PropertyValueNameToEnumMapperBorrowed`]. +/// +/// The name can be a short name (`Lu`), a long name(`Uppercase_Letter`), +/// or an alias. +/// +/// Property names can be looked up using "strict" matching (looking for a name +/// that matches exactly), or "loose matching", where the name is allowed to deviate +/// in terms of ASCII casing, whitespace, underscores, and hyphens. 
+/// +/// # Example +/// +/// ``` +/// use icu::properties::GeneralCategory; +/// +/// let lookup = GeneralCategory::name_to_enum_mapper(); +/// // short name for value +/// assert_eq!( +/// lookup.get_strict("Lu"), +/// Some(GeneralCategory::UppercaseLetter) +/// ); +/// assert_eq!( +/// lookup.get_strict("Pd"), +/// Some(GeneralCategory::DashPunctuation) +/// ); +/// // long name for value +/// assert_eq!( +/// lookup.get_strict("Uppercase_Letter"), +/// Some(GeneralCategory::UppercaseLetter) +/// ); +/// assert_eq!( +/// lookup.get_strict("Dash_Punctuation"), +/// Some(GeneralCategory::DashPunctuation) +/// ); +/// // name has incorrect casing +/// assert_eq!(lookup.get_strict("dashpunctuation"), None); +/// // loose matching of name +/// assert_eq!( +/// lookup.get_loose("dash-punctuation"), +/// Some(GeneralCategory::DashPunctuation) +/// ); +/// // fake property +/// assert_eq!(lookup.get_strict("Animated_Gif"), None); +/// ``` +#[derive(Debug)] +pub struct PropertyValueNameToEnumMapper<T> { + map: DataPayload<ErasedNameToEnumMapV1Marker>, + markers: PhantomData<fn() -> T>, +} + +/// A borrowed wrapper around property value name-to-enum data, returned by +/// [`PropertyValueNameToEnumMapper::as_borrowed()`]. More efficient to query. +#[derive(Debug)] +pub struct PropertyValueNameToEnumMapperBorrowed<'a, T> { + map: &'a PropertyValueNameToEnumMapV1<'a>, + markers: PhantomData<fn() -> T>, +} + +impl<T: TrieValue> PropertyValueNameToEnumMapper<T> { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (like `get_strict()`) by consolidating it + /// up front. 
+ #[inline] + pub fn as_borrowed(&self) -> PropertyValueNameToEnumMapperBorrowed<'_, T> { + PropertyValueNameToEnumMapperBorrowed { + map: self.map.get(), + markers: PhantomData, + } + } + + pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyValueNameToEnumMapV1<'static>>, + { + Self { + map: data.cast(), + markers: PhantomData, + } + } + + #[doc(hidden)] // used by FFI code + pub fn erase(self) -> PropertyValueNameToEnumMapper<u16> { + PropertyValueNameToEnumMapper { + map: self.map.cast(), + markers: PhantomData, + } + } +} + +impl<T: TrieValue> PropertyValueNameToEnumMapperBorrowed<'_, T> { + /// Get the property value as a u16, doing a strict search looking for + /// names that match exactly + /// + /// # Example + /// + /// ``` + /// use icu_properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::name_to_enum_mapper(); + /// assert_eq!( + /// lookup.get_strict_u16("Lu"), + /// Some(GeneralCategory::UppercaseLetter as u16) + /// ); + /// assert_eq!( + /// lookup.get_strict_u16("Uppercase_Letter"), + /// Some(GeneralCategory::UppercaseLetter as u16) + /// ); + /// // does not do loose matching + /// assert_eq!(lookup.get_strict_u16("UppercaseLetter"), None); + /// ``` + #[inline] + pub fn get_strict_u16(&self, name: &str) -> Option<u16> { + get_strict_u16(self.map, name) + } + + /// Get the property value as a `T`, doing a strict search looking for + /// names that match exactly + /// + /// # Example + /// + /// ``` + /// use icu_properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::name_to_enum_mapper(); + /// assert_eq!( + /// lookup.get_strict("Lu"), + /// Some(GeneralCategory::UppercaseLetter) + /// ); + /// assert_eq!( + /// lookup.get_strict("Uppercase_Letter"), + /// Some(GeneralCategory::UppercaseLetter) + /// ); + /// // does not do loose matching + /// assert_eq!(lookup.get_strict("UppercaseLetter"), None); + /// ``` + #[inline] + pub fn get_strict(&self, name: &str) -> 
Option<T> { + T::try_from_u32(self.get_strict_u16(name)? as u32).ok() + } + + /// Get the property value as a u16, doing a loose search looking for + /// names that match case-insensitively, ignoring ASCII hyphens, underscores, and + /// whitespaces. + /// + /// # Example + /// + /// ``` + /// use icu_properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::name_to_enum_mapper(); + /// assert_eq!( + /// lookup.get_loose_u16("Lu"), + /// Some(GeneralCategory::UppercaseLetter as u16) + /// ); + /// assert_eq!( + /// lookup.get_loose_u16("Uppercase_Letter"), + /// Some(GeneralCategory::UppercaseLetter as u16) + /// ); + /// // does do loose matching + /// assert_eq!( + /// lookup.get_loose_u16("UppercaseLetter"), + /// Some(GeneralCategory::UppercaseLetter as u16) + /// ); + /// ``` + #[inline] + pub fn get_loose_u16(&self, name: &str) -> Option<u16> { + get_loose_u16(self.map, name) + } + + /// Get the property value as a `T`, doing a loose search looking for + /// names that match case-insensitively, ignoring ASCII hyphens, underscores, and + /// whitespaces. + /// + /// # Example + /// + /// ``` + /// use icu_properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::name_to_enum_mapper(); + /// assert_eq!( + /// lookup.get_loose("Lu"), + /// Some(GeneralCategory::UppercaseLetter) + /// ); + /// assert_eq!( + /// lookup.get_loose("Uppercase_Letter"), + /// Some(GeneralCategory::UppercaseLetter) + /// ); + /// // does do loose matching + /// assert_eq!( + /// lookup.get_loose("UppercaseLetter"), + /// Some(GeneralCategory::UppercaseLetter) + /// ); + /// ``` + #[inline] + pub fn get_loose(&self, name: &str) -> Option<T> { + T::try_from_u32(self.get_loose_u16(name)? as u32).ok() + } +} + +impl<T: TrieValue> PropertyValueNameToEnumMapperBorrowed<'static, T> { + /// Cheaply converts a `PropertyValueNameToEnumMapperBorrowed<'static>` into a `PropertyValueNameToEnumMapper`. 
+ pub const fn static_to_owned(self) -> PropertyValueNameToEnumMapper<T> { + PropertyValueNameToEnumMapper { + map: DataPayload::from_static_ref(self.map), + markers: PhantomData, + } + } +} + +/// Avoid monomorphizing multiple copies of this function +fn get_strict_u16(payload: &PropertyValueNameToEnumMapV1<'_>, name: &str) -> Option<u16> { + // NormalizedPropertyName has no invariants so this should be free, but + // avoid introducing a panic regardless + let name = NormalizedPropertyNameStr::parse_byte_slice(name.as_bytes()).ok()?; + payload.map.get_copied(name) +} + +/// Avoid monomorphizing multiple copies of this function +fn get_loose_u16(payload: &PropertyValueNameToEnumMapV1<'_>, name: &str) -> Option<u16> { + // NormalizedPropertyName has no invariants so this should be free, but + // avoid introducing a panic regardless + let name = NormalizedPropertyNameStr::parse_byte_slice(name.as_bytes()).ok()?; + payload.map.get_copied_by(|p| p.cmp_loose(name)) +} + +/// Private marker type for PropertyEnumToValueNameSparseMapper +/// to work for all properties at once +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub(crate) struct ErasedEnumToValueNameSparseMapV1Marker; +impl DataMarker for ErasedEnumToValueNameSparseMapV1Marker { + type Yokeable = PropertyEnumToValueNameSparseMapV1<'static>; +} + +/// A struct capable of looking up a property name from a value +/// Access its data by calling [`Self::as_borrowed()`] and using the methods on +/// [`PropertyEnumToValueNameSparseMapperBorrowed`]. +/// +/// This mapper is used for properties with sparse values, like [`CanonicalCombiningClass`]. +/// It may be obtained using methods like [`CanonicalCombiningClass::get_enum_to_long_name_mapper()`]. +/// +/// The name returned may be a short (`"KV"`) or long (`"Kana_Voicing"`) name, depending +/// on the constructor used. 
+/// +/// # Example +/// +/// ``` +/// use icu::properties::CanonicalCombiningClass; +/// +/// let lookup = CanonicalCombiningClass::enum_to_long_name_mapper(); +/// assert_eq!( +/// lookup.get(CanonicalCombiningClass::KanaVoicing), +/// Some("Kana_Voicing") +/// ); +/// assert_eq!( +/// lookup.get(CanonicalCombiningClass::AboveLeft), +/// Some("Above_Left") +/// ); +/// ``` +#[derive(Debug)] +pub struct PropertyEnumToValueNameSparseMapper<T> { + map: DataPayload<ErasedEnumToValueNameSparseMapV1Marker>, + markers: PhantomData<fn(T) -> ()>, +} + +/// A borrowed wrapper around property value enum-to-name data, returned by +/// [`PropertyEnumToValueNameSparseMapper::as_borrowed()`]. More efficient to query. +#[derive(Debug)] +pub struct PropertyEnumToValueNameSparseMapperBorrowed<'a, T> { + map: &'a PropertyEnumToValueNameSparseMapV1<'a>, + markers: PhantomData<fn(T) -> ()>, +} + +impl<T: TrieValue> PropertyEnumToValueNameSparseMapper<T> { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (like `get_static()`) by consolidating it + /// up front. + #[inline] + pub fn as_borrowed(&self) -> PropertyEnumToValueNameSparseMapperBorrowed<'_, T> { + PropertyEnumToValueNameSparseMapperBorrowed { + map: self.map.get(), + markers: PhantomData, + } + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use methods on individual property value types + /// (like [`CanonicalCombiningClass::enum_to_long_name_mapper()`]) instead. 
+ pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyEnumToValueNameSparseMapV1<'static>>, + { + Self { + map: data.cast(), + markers: PhantomData, + } + } +} + +impl<T: TrieValue> PropertyEnumToValueNameSparseMapperBorrowed<'_, T> { + /// Get the property name given a value + /// + /// # Example + /// + /// ```rust + /// use icu::properties::CanonicalCombiningClass; + /// + /// let lookup = CanonicalCombiningClass::enum_to_long_name_mapper(); + /// assert_eq!( + /// lookup.get(CanonicalCombiningClass::KanaVoicing), + /// Some("Kana_Voicing") + /// ); + /// assert_eq!( + /// lookup.get(CanonicalCombiningClass::AboveLeft), + /// Some("Above_Left") + /// ); + /// ``` + #[inline] + pub fn get(&self, property: T) -> Option<&str> { + let prop = u16::try_from(property.to_u32()).ok()?; + self.map.map.get(&prop) + } +} + +impl<T: TrieValue> PropertyEnumToValueNameSparseMapperBorrowed<'static, T> { + /// Cheaply converts a `PropertyEnumToValueNameSparseMapperBorrowed<'static>` into a `PropertyEnumToValueNameSparseMapper`. + pub const fn static_to_owned(self) -> PropertyEnumToValueNameSparseMapper<T> { + PropertyEnumToValueNameSparseMapper { + map: DataPayload::from_static_ref(self.map), + markers: PhantomData, + } + } +} + +/// Private marker type for PropertyEnumToValueNameLinearMapper +/// to work for all properties at once +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub(crate) struct ErasedEnumToValueNameLinearMapV1Marker; +impl DataMarker for ErasedEnumToValueNameLinearMapV1Marker { + type Yokeable = PropertyEnumToValueNameLinearMapV1<'static>; +} + +/// A struct capable of looking up a property name from a value +/// Access its data by calling [`Self::as_borrowed()`] and using the methods on +/// [`PropertyEnumToValueNameLinearMapperBorrowed`]. +/// +/// This mapper is used for properties with sequential values, like [`GeneralCategory`]. 
+/// It may be obtained using methods like [`GeneralCategory::get_enum_to_long_name_mapper()`]. +/// +/// The name returned may be a short (`"Lu"`) or long (`"Uppercase_Letter"`) name, depending +/// on the constructor used. +/// +/// # Example +/// +/// ``` +/// use icu::properties::GeneralCategory; +/// +/// let lookup = GeneralCategory::enum_to_long_name_mapper(); +/// assert_eq!( +/// lookup.get(GeneralCategory::UppercaseLetter), +/// Some("Uppercase_Letter") +/// ); +/// assert_eq!( +/// lookup.get(GeneralCategory::DashPunctuation), +/// Some("Dash_Punctuation") +/// ); +/// ``` +#[derive(Debug)] +pub struct PropertyEnumToValueNameLinearMapper<T> { + map: DataPayload<ErasedEnumToValueNameLinearMapV1Marker>, + markers: PhantomData<fn(T) -> ()>, +} + +/// A borrowed wrapper around property value enum-to-name data, returned by +/// [`PropertyEnumToValueNameLinearMapper::as_borrowed()`]. More efficient to query. +#[derive(Debug)] +pub struct PropertyEnumToValueNameLinearMapperBorrowed<'a, T> { + map: &'a PropertyEnumToValueNameLinearMapV1<'a>, + markers: PhantomData<fn(T) -> ()>, +} + +impl<T: TrieValue> PropertyEnumToValueNameLinearMapper<T> { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (like `get_static()`) by consolidating it + /// up front. + #[inline] + pub fn as_borrowed(&self) -> PropertyEnumToValueNameLinearMapperBorrowed<'_, T> { + PropertyEnumToValueNameLinearMapperBorrowed { + map: self.map.get(), + markers: PhantomData, + } + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use methods on individual property value types + /// (like [`GeneralCategory::enum_to_long_name_mapper()`]) instead. 
+ pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyEnumToValueNameLinearMapV1<'static>>, + { + Self { + map: data.cast(), + markers: PhantomData, + } + } +} + +impl<T: TrieValue> PropertyEnumToValueNameLinearMapperBorrowed<'_, T> { + /// Get the property name given a value + /// + /// # Example + /// + /// ```rust + /// use icu::properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(GeneralCategory::UppercaseLetter), Some("Lu")); + /// assert_eq!(lookup.get(GeneralCategory::DashPunctuation), Some("Pd")); + /// ``` + #[inline] + pub fn get(&self, property: T) -> Option<&str> { + let prop = usize::try_from(property.to_u32()).ok()?; + self.map.map.get(prop).filter(|x| !x.is_empty()) + } +} + +impl<T: TrieValue> PropertyEnumToValueNameLinearMapperBorrowed<'static, T> { + /// Cheaply converts a `PropertyEnumToValueNameLinearMapperBorrowed<'static>` into a `PropertyEnumToValueNameLinearMapper`. + pub const fn static_to_owned(self) -> PropertyEnumToValueNameLinearMapper<T> { + PropertyEnumToValueNameLinearMapper { + map: DataPayload::from_static_ref(self.map), + markers: PhantomData, + } + } +} + +/// Private marker type for PropertyEnumToValueNameLinearTiny4Mapper +/// to work for all properties at once +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub(crate) struct ErasedEnumToValueNameLinearTiny4MapV1Marker; +impl DataMarker for ErasedEnumToValueNameLinearTiny4MapV1Marker { + type Yokeable = PropertyEnumToValueNameLinearTiny4MapV1<'static>; +} + +/// A struct capable of looking up a property name from a value +/// Access its data by calling [`Self::as_borrowed()`] and using the methods on +/// [`PropertyEnumToValueNameLinearTiny4MapperBorrowed`]. +/// +/// This mapper is used for properties with sequential values and names with four or fewer characters, +/// like the [`Script`] short names. 
+/// It may be obtained using methods like [`Script::get_enum_to_short_name_mapper()`]. +/// +/// # Example +/// +/// ``` +/// use icu::properties::Script; +/// use tinystr::tinystr; +/// +/// let lookup = Script::enum_to_short_name_mapper(); +/// assert_eq!(lookup.get(Script::Brahmi), Some(tinystr!(4, "Brah"))); +/// assert_eq!(lookup.get(Script::Hangul), Some(tinystr!(4, "Hang"))); +/// ``` +#[derive(Debug)] +pub struct PropertyEnumToValueNameLinearTiny4Mapper<T> { + map: DataPayload<ErasedEnumToValueNameLinearTiny4MapV1Marker>, + markers: PhantomData<fn(T) -> ()>, +} + +/// A borrowed wrapper around property value enum-to-name data, returned by +/// [`PropertyEnumToValueNameLinearTiny4Mapper::as_borrowed()`]. More efficient to query. +#[derive(Debug)] +pub struct PropertyEnumToValueNameLinearTiny4MapperBorrowed<'a, T> { + map: &'a PropertyEnumToValueNameLinearTiny4MapV1<'a>, + markers: PhantomData<fn(T) -> ()>, +} + +impl<T: TrieValue> PropertyEnumToValueNameLinearTiny4Mapper<T> { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (like `get_static()`) by consolidating it + /// up front. + #[inline] + pub fn as_borrowed(&self) -> PropertyEnumToValueNameLinearTiny4MapperBorrowed<'_, T> { + PropertyEnumToValueNameLinearTiny4MapperBorrowed { + map: self.map.get(), + markers: PhantomData, + } + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use methods on individual property value types + /// (like [`Script::enum_to_short_name_mapper()`]) instead. 
+ pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyEnumToValueNameLinearTiny4MapV1<'static>>, + { + Self { + map: data.cast(), + markers: PhantomData, + } + } +} + +impl<T: TrieValue> PropertyEnumToValueNameLinearTiny4MapperBorrowed<'_, T> { + /// Get the property name given a value + /// + /// # Example + /// + /// ```rust + /// use icu::properties::Script; + /// use tinystr::tinystr; + /// + /// let lookup = Script::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(Script::Brahmi), Some(tinystr!(4, "Brah"))); + /// assert_eq!(lookup.get(Script::Hangul), Some(tinystr!(4, "Hang"))); + /// ``` + #[inline] + pub fn get(&self, property: T) -> Option<tinystr::TinyStr4> { + let prop = usize::try_from(property.to_u32()).ok()?; + self.map.map.get(prop).filter(|x| !x.is_empty()) + } +} + +impl<T: TrieValue> PropertyEnumToValueNameLinearTiny4MapperBorrowed<'static, T> { + /// Cheaply converts a `PropertyEnumToValueNameLinearTiny4MapperBorrowed<'static>` into a `PropertyEnumToValueNameLinearTiny4Mapper`. + pub const fn static_to_owned(self) -> PropertyEnumToValueNameLinearTiny4Mapper<T> { + PropertyEnumToValueNameLinearTiny4Mapper { + map: DataPayload::from_static_ref(self.map), + markers: PhantomData, + } + } +} + +macro_rules! impl_value_getter { + ( + // the marker type for names lookup (name_to_enum, enum_to_short_name, enum_to_long_name) + markers: $marker_n2e:ident / $singleton_n2e:ident $(, $marker_e2sn:ident / $singleton_e2sn:ident, $marker_e2ln:ident / $singleton_e2ln:ident)?; + impl $ty:ident { + $(#[$attr_n2e:meta])* + $vis_n2e:vis fn $name_n2e:ident() / $cname_n2e:ident(); + $( + + $(#[$attr_e2sn:meta])* + $vis_e2sn:vis fn $name_e2sn:ident() / $cname_e2sn:ident() -> $mapper_e2sn:ident / $mapper_e2snb:ident; + $(#[$attr_e2ln:meta])* + $vis_e2ln:vis fn $name_e2ln:ident() / $cname_e2ln:ident() -> $mapper_e2ln:ident / $mapper_e2lnb:ident; + )? 
+ } + ) => { + impl $ty { + $(#[$attr_n2e])* + #[cfg(feature = "compiled_data")] + $vis_n2e fn $cname_n2e() -> PropertyValueNameToEnumMapperBorrowed<'static, $ty> { + PropertyValueNameToEnumMapperBorrowed { + map: crate::provider::Baked::$singleton_n2e, + markers: PhantomData, + } + } + + #[doc = concat!("A version of [`", stringify!($ty), "::", stringify!($cname_n2e), "()`] that uses custom data provided by a [`DataProvider`].")] + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + $vis_n2e fn $name_n2e( + provider: &(impl DataProvider<$marker_n2e> + ?Sized) + ) -> Result<PropertyValueNameToEnumMapper<$ty>, PropertiesError> { + Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map(PropertyValueNameToEnumMapper::from_data)?) + } + + $( + $(#[$attr_e2sn])* + #[cfg(feature = "compiled_data")] + $vis_e2sn fn $cname_e2sn() -> $mapper_e2snb<'static, $ty> { + $mapper_e2snb { + map: crate::provider::Baked::$singleton_e2sn, + markers: PhantomData, + } + } + + #[doc = concat!("A version of [`", stringify!($ty), "::", stringify!($cname_e2sn), "()`] that uses custom data provided by a [`DataProvider`].")] + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + $vis_e2sn fn $name_e2sn( + provider: &(impl DataProvider<$marker_e2sn> + ?Sized) + ) -> Result<$mapper_e2sn<$ty>, PropertiesError> { + Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map($mapper_e2sn::from_data)?) 
+ } + + $(#[$attr_e2ln])* + #[cfg(feature = "compiled_data")] + $vis_e2ln fn $cname_e2ln() -> $mapper_e2lnb<'static, $ty> { + $mapper_e2lnb { + map: crate::provider::Baked::$singleton_e2ln, + markers: PhantomData, + } + } + + #[doc = concat!("A version of [`", stringify!($ty), "::", stringify!($cname_e2ln), "()`] that uses custom data provided by a [`DataProvider`].")] + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + $vis_e2ln fn $name_e2ln( + provider: &(impl DataProvider<$marker_e2ln> + ?Sized) + ) -> Result<$mapper_e2ln<$ty>, PropertiesError> { + Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map($mapper_e2ln::from_data)?) + } + )? + } + } +} + +/// Enumerated property Bidi_Class +/// +/// These are the categories required by the Unicode Bidirectional Algorithm. +/// For the property values, see [Bidirectional Class Values](https://unicode.org/reports/tr44/#Bidi_Class_Values). +/// For more information, see [Unicode Standard Annex #9](https://unicode.org/reports/tr41/tr41-28.html#UAX9). 
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(BidiClassULE)] +pub struct BidiClass(pub u8); + +#[allow(non_upper_case_globals)] +impl BidiClass { + /// (`L`) any strong left-to-right character + pub const LeftToRight: BidiClass = BidiClass(0); + /// (`R`) any strong right-to-left (non-Arabic-type) character + pub const RightToLeft: BidiClass = BidiClass(1); + /// (`EN`) any ASCII digit or Eastern Arabic-Indic digit + pub const EuropeanNumber: BidiClass = BidiClass(2); + /// (`ES`) plus and minus signs + pub const EuropeanSeparator: BidiClass = BidiClass(3); + /// (`ET`) a terminator in a numeric format context, includes currency signs + pub const EuropeanTerminator: BidiClass = BidiClass(4); + /// (`AN`) any Arabic-Indic digit + pub const ArabicNumber: BidiClass = BidiClass(5); + /// (`CS`) commas, colons, and slashes + pub const CommonSeparator: BidiClass = BidiClass(6); + /// (`B`) various newline characters + pub const ParagraphSeparator: BidiClass = BidiClass(7); + /// (`S`) various segment-related control codes + pub const SegmentSeparator: BidiClass = BidiClass(8); + /// (`WS`) spaces + pub const WhiteSpace: BidiClass = BidiClass(9); + /// (`ON`) most other symbols and punctuation marks + pub const OtherNeutral: BidiClass = BidiClass(10); + /// (`LRE`) U+202A: the LR embedding control + pub const LeftToRightEmbedding: BidiClass = BidiClass(11); + /// (`LRO`) U+202D: the LR override control + pub const LeftToRightOverride: BidiClass = BidiClass(12); + /// (`AL`) any strong right-to-left (Arabic-type) character + pub const ArabicLetter: BidiClass = BidiClass(13); + /// (`RLE`) U+202B: the RL embedding control + pub const RightToLeftEmbedding: BidiClass = BidiClass(14); 
+ /// (`RLO`) U+202E: the RL override control + pub const RightToLeftOverride: BidiClass = BidiClass(15); + /// (`PDF`) U+202C: terminates an embedding or override control + pub const PopDirectionalFormat: BidiClass = BidiClass(16); + /// (`NSM`) any nonspacing mark + pub const NonspacingMark: BidiClass = BidiClass(17); + /// (`BN`) most format characters, control codes, or noncharacters + pub const BoundaryNeutral: BidiClass = BidiClass(18); + /// (`FSI`) U+2068: the first strong isolate control + pub const FirstStrongIsolate: BidiClass = BidiClass(19); + /// (`LRI`) U+2066: the LR isolate control + pub const LeftToRightIsolate: BidiClass = BidiClass(20); + /// (`RLI`) U+2067: the RL isolate control + pub const RightToLeftIsolate: BidiClass = BidiClass(21); + /// (`PDI`) U+2069: terminates an isolate control + pub const PopDirectionalIsolate: BidiClass = BidiClass(22); +} + +impl_value_getter! { + markers: BidiClassNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_BC_V1, BidiClassValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_BC_V1, BidiClassValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_BC_V1; + impl BidiClass { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Bidi_Class` enumerated property + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::BidiClass; + /// + /// let lookup = BidiClass::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("AN"), Some(BidiClass::ArabicNumber)); + /// assert_eq!(lookup.get_strict("NSM"), Some(BidiClass::NonspacingMark)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Arabic_Number"), Some(BidiClass::ArabicNumber)); + /// assert_eq!(lookup.get_strict("Nonspacing_Mark"), Some(BidiClass::NonspacingMark)); + /// // name has incorrect casing + /// 
assert_eq!(lookup.get_strict("arabicnumber"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("arabicnumber"), Some(BidiClass::ArabicNumber)); + /// // fake property + /// assert_eq!(lookup.get_strict("Upside_Down_Vertical_Backwards_Mirrored"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `Bidi_Class` enumerated property + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::BidiClass; + /// + /// let lookup = BidiClass::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(BidiClass::ArabicNumber), Some("AN")); + /// assert_eq!(lookup.get(BidiClass::NonspacingMark), Some("NSM")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Bidi_Class` enumerated property + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::BidiClass; + /// + /// let lookup = BidiClass::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(BidiClass::ArabicNumber), Some("Arabic_Number")); + /// assert_eq!(lookup.get(BidiClass::NonspacingMark), Some("Nonspacing_Mark")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +/// Enumerated property General_Category. 
+/// +/// General_Category specifies the most general classification of a code point, usually +/// determined based on the primary characteristic of the assigned character. For example, is the +/// character a letter, a mark, a number, punctuation, or a symbol, and if so, of what type? +/// +/// GeneralCategory only supports specific subcategories (eg `UppercaseLetter`). +/// It does not support grouped categories (eg `Letter`). For grouped categories, use [`GeneralCategoryGroup`]. +#[derive(Copy, Clone, PartialEq, Eq, Debug, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_enums)] // this type is stable +#[zerovec::make_ule(GeneralCategoryULE)] +#[repr(u8)] +pub enum GeneralCategory { + /// (`Cn`) A reserved unassigned code point or a noncharacter + Unassigned = 0, + + /// (`Lu`) An uppercase letter + UppercaseLetter = 1, + /// (`Ll`) A lowercase letter + LowercaseLetter = 2, + /// (`Lt`) A digraphic letter, with first part uppercase + TitlecaseLetter = 3, + /// (`Lm`) A modifier letter + ModifierLetter = 4, + /// (`Lo`) Other letters, including syllables and ideographs + OtherLetter = 5, + + /// (`Mn`) A nonspacing combining mark (zero advance width) + NonspacingMark = 6, + /// (`Mc`) A spacing combining mark (positive advance width) + SpacingMark = 8, + /// (`Me`) An enclosing combining mark + EnclosingMark = 7, + + /// (`Nd`) A decimal digit + DecimalNumber = 9, + /// (`Nl`) A letterlike numeric character + LetterNumber = 10, + /// (`No`) A numeric character of other type + OtherNumber = 11, + + /// (`Zs`) A space character (of various non-zero widths) + SpaceSeparator = 12, + /// (`Zl`) U+2028 LINE SEPARATOR only + LineSeparator = 13, + /// (`Zp`) U+2029 PARAGRAPH SEPARATOR only + ParagraphSeparator = 14, + + /// (`Cc`) A C0 or C1 control code + Control = 15, + /// (`Cf`) A 
format control character + Format = 16, + /// (`Co`) A private-use character + PrivateUse = 17, + /// (`Cs`) A surrogate code point + Surrogate = 18, + + /// (`Pd`) A dash or hyphen punctuation mark + DashPunctuation = 19, + /// (`Ps`) An opening punctuation mark (of a pair) + OpenPunctuation = 20, + /// (`Pe`) A closing punctuation mark (of a pair) + ClosePunctuation = 21, + /// (`Pc`) A connecting punctuation mark, like a tie + ConnectorPunctuation = 22, + /// (`Pi`) An initial quotation mark + InitialPunctuation = 28, + /// (`Pf`) A final quotation mark + FinalPunctuation = 29, + /// (`Po`) A punctuation mark of other type + OtherPunctuation = 23, + + /// (`Sm`) A symbol of mathematical use + MathSymbol = 24, + /// (`Sc`) A currency sign + CurrencySymbol = 25, + /// (`Sk`) A non-letterlike modifier symbol + ModifierSymbol = 26, + /// (`So`) A symbol of other type + OtherSymbol = 27, +} + +impl_value_getter! { + markers: GeneralCategoryNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_GC_V1, GeneralCategoryValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_GC_V1, GeneralCategoryValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_GC_V1; + impl GeneralCategory { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `General_Category` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("Lu"), Some(GeneralCategory::UppercaseLetter)); + /// assert_eq!(lookup.get_strict("Pd"), Some(GeneralCategory::DashPunctuation)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Uppercase_Letter"), Some(GeneralCategory::UppercaseLetter)); + /// assert_eq!(lookup.get_strict("Dash_Punctuation"), Some(GeneralCategory::DashPunctuation)); + /// // name has incorrect casing + /// assert_eq!(lookup.get_strict("dashpunctuation"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("dash-punctuation"), Some(GeneralCategory::DashPunctuation)); + /// // fake property + /// assert_eq!(lookup.get_loose("Animated_Gif"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `General_Category` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(GeneralCategory::UppercaseLetter), Some("Lu")); + /// assert_eq!(lookup.get(GeneralCategory::DashPunctuation), Some("Pd")); + /// assert_eq!(lookup.get(GeneralCategory::FinalPunctuation), Some("Pf")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `General_Category` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GeneralCategory; + /// + /// let lookup = GeneralCategory::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(GeneralCategory::UppercaseLetter), Some("Uppercase_Letter")); + /// assert_eq!(lookup.get(GeneralCategory::DashPunctuation), Some("Dash_Punctuation")); + /// assert_eq!(lookup.get(GeneralCategory::FinalPunctuation), Some("Final_Punctuation")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash, Default)] +pub struct GeneralCategoryTryFromError; + +impl TryFrom<u8> for GeneralCategory { + type Error = GeneralCategoryTryFromError; + /// Construct this [`GeneralCategory`] from an integer, returning + /// an error if it is out of bounds + fn try_from(val: u8) -> Result<Self, GeneralCategoryTryFromError> { + 
GeneralCategory::new_from_u8(val).ok_or(GeneralCategoryTryFromError) + } +} + +/// Groupings of multiple General_Category property values. +/// +/// Instances of `GeneralCategoryGroup` represent the defined multi-category +/// values that are useful for users in certain contexts, such as regex. In +/// other words, unlike [`GeneralCategory`], this supports groups of general +/// categories: for example, `Letter` is the union of `UppercaseLetter`, +/// `LowercaseLetter`, etc. +/// +/// See <https://www.unicode.org/reports/tr44/>. +/// +/// The discriminants correspond to the `U_GC_XX_MASK` constants in ICU4C. +/// Unlike [`GeneralCategory`], this supports groups of general categories: for example, `Letter` +/// is the union of `UppercaseLetter`, `LowercaseLetter`, etc. +/// +/// See `UCharCategory` and `U_GET_GC_MASK` in ICU4C. +#[derive(Copy, Clone, PartialEq, Debug, Eq)] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +pub struct GeneralCategoryGroup(pub(crate) u32); + +use GeneralCategory as GC; +use GeneralCategoryGroup as GCG; + +#[allow(non_upper_case_globals)] +impl GeneralCategoryGroup { + /// (`Lu`) An uppercase letter + pub const UppercaseLetter: GeneralCategoryGroup = GCG(1 << (GC::UppercaseLetter as u32)); + /// (`Ll`) A lowercase letter + pub const LowercaseLetter: GeneralCategoryGroup = GCG(1 << (GC::LowercaseLetter as u32)); + /// (`Lt`) A digraphic letter, with first part uppercase + pub const TitlecaseLetter: GeneralCategoryGroup = GCG(1 << (GC::TitlecaseLetter as u32)); + /// (`Lm`) A modifier letter + pub const ModifierLetter: GeneralCategoryGroup = GCG(1 << (GC::ModifierLetter as u32)); + /// (`Lo`) Other letters, including syllables and ideographs + pub const OtherLetter: GeneralCategoryGroup = GCG(1 << (GC::OtherLetter as u32)); + /// (`LC`) The union of UppercaseLetter, LowercaseLetter, and TitlecaseLetter + pub const CasedLetter: GeneralCategoryGroup = GCG(1 << (GC::UppercaseLetter as u32) + | 1 << 
(GC::LowercaseLetter as u32) + | 1 << (GC::TitlecaseLetter as u32)); + /// (`L`) The union of all letter categories + pub const Letter: GeneralCategoryGroup = GCG(1 << (GC::UppercaseLetter as u32) + | 1 << (GC::LowercaseLetter as u32) + | 1 << (GC::TitlecaseLetter as u32) + | 1 << (GC::ModifierLetter as u32) + | 1 << (GC::OtherLetter as u32)); + + /// (`Mn`) A nonspacing combining mark (zero advance width) + pub const NonspacingMark: GeneralCategoryGroup = GCG(1 << (GC::NonspacingMark as u32)); + /// (`Me`) An enclosing combining mark + pub const EnclosingMark: GeneralCategoryGroup = GCG(1 << (GC::EnclosingMark as u32)); + /// (`Mc`) A spacing combining mark (positive advance width) + pub const SpacingMark: GeneralCategoryGroup = GCG(1 << (GC::SpacingMark as u32)); + /// (`M`) The union of all mark categories + pub const Mark: GeneralCategoryGroup = GCG(1 << (GC::NonspacingMark as u32) + | 1 << (GC::EnclosingMark as u32) + | 1 << (GC::SpacingMark as u32)); + + /// (`Nd`) A decimal digit + pub const DecimalNumber: GeneralCategoryGroup = GCG(1 << (GC::DecimalNumber as u32)); + /// (`Nl`) A letterlike numeric character + pub const LetterNumber: GeneralCategoryGroup = GCG(1 << (GC::LetterNumber as u32)); + /// (`No`) A numeric character of other type + pub const OtherNumber: GeneralCategoryGroup = GCG(1 << (GC::OtherNumber as u32)); + /// (`N`) The union of all number categories + pub const Number: GeneralCategoryGroup = GCG(1 << (GC::DecimalNumber as u32) + | 1 << (GC::LetterNumber as u32) + | 1 << (GC::OtherNumber as u32)); + + /// (`Zs`) A space character (of various non-zero widths) + pub const SpaceSeparator: GeneralCategoryGroup = GCG(1 << (GC::SpaceSeparator as u32)); + /// (`Zl`) U+2028 LINE SEPARATOR only + pub const LineSeparator: GeneralCategoryGroup = GCG(1 << (GC::LineSeparator as u32)); + /// (`Zp`) U+2029 PARAGRAPH SEPARATOR only + pub const ParagraphSeparator: GeneralCategoryGroup = GCG(1 << (GC::ParagraphSeparator as u32)); + /// (`Z`) The union of all 
separator categories + pub const Separator: GeneralCategoryGroup = GCG(1 << (GC::SpaceSeparator as u32) + | 1 << (GC::LineSeparator as u32) + | 1 << (GC::ParagraphSeparator as u32)); + + /// (`Cc`) A C0 or C1 control code + pub const Control: GeneralCategoryGroup = GCG(1 << (GC::Control as u32)); + /// (`Cf`) A format control character + pub const Format: GeneralCategoryGroup = GCG(1 << (GC::Format as u32)); + /// (`Co`) A private-use character + pub const PrivateUse: GeneralCategoryGroup = GCG(1 << (GC::PrivateUse as u32)); + /// (`Cs`) A surrogate code point + pub const Surrogate: GeneralCategoryGroup = GCG(1 << (GC::Surrogate as u32)); + /// (`Cn`) A reserved unassigned code point or a noncharacter + pub const Unassigned: GeneralCategoryGroup = GCG(1 << (GC::Unassigned as u32)); + /// (`C`) The union of all control code, reserved, and unassigned categories + pub const Other: GeneralCategoryGroup = GCG(1 << (GC::Control as u32) + | 1 << (GC::Format as u32) + | 1 << (GC::PrivateUse as u32) + | 1 << (GC::Surrogate as u32) + | 1 << (GC::Unassigned as u32)); + + /// (`Pd`) A dash or hyphen punctuation mark + pub const DashPunctuation: GeneralCategoryGroup = GCG(1 << (GC::DashPunctuation as u32)); + /// (`Ps`) An opening punctuation mark (of a pair) + pub const OpenPunctuation: GeneralCategoryGroup = GCG(1 << (GC::OpenPunctuation as u32)); + /// (`Pe`) A closing punctuation mark (of a pair) + pub const ClosePunctuation: GeneralCategoryGroup = GCG(1 << (GC::ClosePunctuation as u32)); + /// (`Pc`) A connecting punctuation mark, like a tie + pub const ConnectorPunctuation: GeneralCategoryGroup = + GCG(1 << (GC::ConnectorPunctuation as u32)); + /// (`Pi`) An initial quotation mark + pub const InitialPunctuation: GeneralCategoryGroup = GCG(1 << (GC::InitialPunctuation as u32)); + /// (`Pf`) A final quotation mark + pub const FinalPunctuation: GeneralCategoryGroup = GCG(1 << (GC::FinalPunctuation as u32)); + /// (`Po`) A punctuation mark of other type + pub const 
OtherPunctuation: GeneralCategoryGroup = GCG(1 << (GC::OtherPunctuation as u32)); + /// (`P`) The union of all punctuation categories + pub const Punctuation: GeneralCategoryGroup = GCG(1 << (GC::DashPunctuation as u32) + | 1 << (GC::OpenPunctuation as u32) + | 1 << (GC::ClosePunctuation as u32) + | 1 << (GC::ConnectorPunctuation as u32) + | 1 << (GC::OtherPunctuation as u32) + | 1 << (GC::InitialPunctuation as u32) + | 1 << (GC::FinalPunctuation as u32)); + + /// (`Sm`) A symbol of mathematical use + pub const MathSymbol: GeneralCategoryGroup = GCG(1 << (GC::MathSymbol as u32)); + /// (`Sc`) A currency sign + pub const CurrencySymbol: GeneralCategoryGroup = GCG(1 << (GC::CurrencySymbol as u32)); + /// (`Sk`) A non-letterlike modifier symbol + pub const ModifierSymbol: GeneralCategoryGroup = GCG(1 << (GC::ModifierSymbol as u32)); + /// (`So`) A symbol of other type + pub const OtherSymbol: GeneralCategoryGroup = GCG(1 << (GC::OtherSymbol as u32)); + /// (`S`) The union of all symbol categories + pub const Symbol: GeneralCategoryGroup = GCG(1 << (GC::MathSymbol as u32) + | 1 << (GC::CurrencySymbol as u32) + | 1 << (GC::ModifierSymbol as u32) + | 1 << (GC::OtherSymbol as u32)); + + const ALL: u32 = (1 << (GC::FinalPunctuation as u32 + 1)) - 1; + + /// Return whether the code point belongs in the provided multi-value category. 
+ /// + /// ``` + /// use icu::properties::{maps, GeneralCategory, GeneralCategoryGroup}; + /// use icu_collections::codepointtrie::CodePointTrie; + /// + /// let gc = maps::general_category(); + /// + /// assert_eq!(gc.get('A'), GeneralCategory::UppercaseLetter); + /// assert!(GeneralCategoryGroup::CasedLetter.contains(gc.get('A'))); + /// + /// // U+0B1E ORIYA LETTER NYA + /// assert_eq!(gc.get('ଞ'), GeneralCategory::OtherLetter); + /// assert!(GeneralCategoryGroup::Letter.contains(gc.get('ଞ'))); + /// assert!(!GeneralCategoryGroup::CasedLetter.contains(gc.get('ଞ'))); + /// + /// // U+0301 COMBINING ACUTE ACCENT + /// assert_eq!(gc.get32(0x0301), GeneralCategory::NonspacingMark); + /// assert!(GeneralCategoryGroup::Mark.contains(gc.get32(0x0301))); + /// assert!(!GeneralCategoryGroup::Letter.contains(gc.get32(0x0301))); + /// + /// assert_eq!(gc.get('0'), GeneralCategory::DecimalNumber); + /// assert!(GeneralCategoryGroup::Number.contains(gc.get('0'))); + /// assert!(!GeneralCategoryGroup::Mark.contains(gc.get('0'))); + /// + /// assert_eq!(gc.get('('), GeneralCategory::OpenPunctuation); + /// assert!(GeneralCategoryGroup::Punctuation.contains(gc.get('('))); + /// assert!(!GeneralCategoryGroup::Number.contains(gc.get('('))); + /// + /// // U+2713 CHECK MARK + /// assert_eq!(gc.get('✓'), GeneralCategory::OtherSymbol); + /// assert!(GeneralCategoryGroup::Symbol.contains(gc.get('✓'))); + /// assert!(!GeneralCategoryGroup::Punctuation.contains(gc.get('✓'))); + /// + /// assert_eq!(gc.get(' '), GeneralCategory::SpaceSeparator); + /// assert!(GeneralCategoryGroup::Separator.contains(gc.get(' '))); + /// assert!(!GeneralCategoryGroup::Symbol.contains(gc.get(' '))); + /// + /// // U+E007F CANCEL TAG + /// assert_eq!(gc.get32(0xE007F), GeneralCategory::Format); + /// assert!(GeneralCategoryGroup::Other.contains(gc.get32(0xE007F))); + /// assert!(!GeneralCategoryGroup::Separator.contains(gc.get32(0xE007F))); + /// ``` + pub const fn contains(&self, val: GeneralCategory) -> 
bool { + 0 != (1 << (val as u32)) & self.0 + } + + /// Produce a GeneralCategoryGroup that is the inverse of this one + /// + /// # Example + /// + /// ```rust + /// use icu::properties::{GeneralCategory, GeneralCategoryGroup}; + /// + /// let letter = GeneralCategoryGroup::Letter; + /// let not_letter = letter.complement(); + /// + /// assert!(not_letter.contains(GeneralCategory::MathSymbol)); + /// assert!(!letter.contains(GeneralCategory::MathSymbol)); + /// assert!(not_letter.contains(GeneralCategory::OtherPunctuation)); + /// assert!(!letter.contains(GeneralCategory::OtherPunctuation)); + /// assert!(!not_letter.contains(GeneralCategory::UppercaseLetter)); + /// assert!(letter.contains(GeneralCategory::UppercaseLetter)); + /// ``` + pub const fn complement(self) -> Self { + // Mask off things not in Self::ALL to guarantee the mask + // values stay in-range + GeneralCategoryGroup(!self.0 & Self::ALL) + } + + /// Return the group representing all GeneralCategory values + /// + /// # Example + /// + /// ```rust + /// use icu::properties::{GeneralCategory, GeneralCategoryGroup}; + /// + /// let all = GeneralCategoryGroup::all(); + /// + /// assert!(all.contains(GeneralCategory::MathSymbol)); + /// assert!(all.contains(GeneralCategory::OtherPunctuation)); + /// assert!(all.contains(GeneralCategory::UppercaseLetter)); + /// ``` + pub const fn all() -> Self { + Self(Self::ALL) + } + + /// Return the empty group + /// + /// # Example + /// + /// ```rust + /// use icu::properties::{GeneralCategory, GeneralCategoryGroup}; + /// + /// let empty = GeneralCategoryGroup::empty(); + /// + /// assert!(!empty.contains(GeneralCategory::MathSymbol)); + /// assert!(!empty.contains(GeneralCategory::OtherPunctuation)); + /// assert!(!empty.contains(GeneralCategory::UppercaseLetter)); + /// ``` + pub const fn empty() -> Self { + Self(0) + } + + /// Take the union of two groups + /// + /// # Example + /// + /// ```rust + /// use icu::properties::{GeneralCategory, 
GeneralCategoryGroup}; + /// + /// let letter = GeneralCategoryGroup::Letter; + /// let symbol = GeneralCategoryGroup::Symbol; + /// let union = letter.union(symbol); + /// + /// assert!(union.contains(GeneralCategory::MathSymbol)); + /// assert!(!union.contains(GeneralCategory::OtherPunctuation)); + /// assert!(union.contains(GeneralCategory::UppercaseLetter)); + /// ``` + pub const fn union(self, other: Self) -> Self { + Self(self.0 | other.0) + } + + /// Take the intersection of two groups + /// + /// # Example + /// + /// ```rust + /// use icu::properties::{GeneralCategory, GeneralCategoryGroup}; + /// + /// let letter = GeneralCategoryGroup::Letter; + /// let lu = GeneralCategoryGroup::UppercaseLetter; + /// let intersection = letter.intersection(lu); + /// + /// assert!(!intersection.contains(GeneralCategory::MathSymbol)); + /// assert!(!intersection.contains(GeneralCategory::OtherPunctuation)); + /// assert!(intersection.contains(GeneralCategory::UppercaseLetter)); + /// assert!(!intersection.contains(GeneralCategory::LowercaseLetter)); + /// ``` + pub const fn intersection(self, other: Self) -> Self { + Self(self.0 & other.0) + } +} + +impl_value_getter! { + markers: GeneralCategoryMaskNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_GCM_V1; + impl GeneralCategoryGroup { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `General_Category_Mask` mask property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GeneralCategoryGroup; + /// + /// let lookup = GeneralCategoryGroup::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("L"), Some(GeneralCategoryGroup::Letter)); + /// assert_eq!(lookup.get_strict("LC"), Some(GeneralCategoryGroup::CasedLetter)); + /// assert_eq!(lookup.get_strict("Lu"), Some(GeneralCategoryGroup::UppercaseLetter)); + /// assert_eq!(lookup.get_strict("Zp"), Some(GeneralCategoryGroup::ParagraphSeparator)); + /// assert_eq!(lookup.get_strict("P"), Some(GeneralCategoryGroup::Punctuation)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Letter"), Some(GeneralCategoryGroup::Letter)); + /// assert_eq!(lookup.get_strict("Cased_Letter"), Some(GeneralCategoryGroup::CasedLetter)); + /// assert_eq!(lookup.get_strict("Uppercase_Letter"), Some(GeneralCategoryGroup::UppercaseLetter)); + /// // alias name + /// assert_eq!(lookup.get_strict("punct"), Some(GeneralCategoryGroup::Punctuation)); + /// // name has incorrect casing + /// assert_eq!(lookup.get_strict("letter"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("letter"), Some(GeneralCategoryGroup::Letter)); + /// // fake property + /// assert_eq!(lookup.get_strict("EverythingLol"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + } +} + +impl From<GeneralCategory> for GeneralCategoryGroup { + fn from(subcategory: GeneralCategory) -> Self { + GeneralCategoryGroup(1 << (subcategory as u32)) + } +} +impl From<u32> for GeneralCategoryGroup { + fn from(mask: u32) -> Self { + // Mask off things not in Self::ALL to guarantee the mask + // values stay in-range + GeneralCategoryGroup(mask & Self::ALL) + } +} +impl From<GeneralCategoryGroup> for u32 { + fn from(group: GeneralCategoryGroup) -> Self { + group.0 + 
} +} +/// Enumerated property Script. +/// +/// This is used with both the Script and Script_Extensions Unicode properties. +/// Each character is assigned a single Script, but characters that are used in +/// a particular subset of scripts will be in more than one Script_Extensions set. +/// For example, DEVANAGARI DIGIT NINE has Script=Devanagari, but is also in the +/// Script_Extensions set for Dogra, Kaithi, and Mahajani. +/// +/// For more information, see UAX #24: <http://www.unicode.org/reports/tr24/>. +/// See `UScriptCode` in ICU4C. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(ScriptULE)] +pub struct Script(pub u16); + +#[allow(missing_docs)] // These constants don't need individual documentation. 
+#[allow(non_upper_case_globals)] +impl Script { + pub const Adlam: Script = Script(167); + pub const Ahom: Script = Script(161); + pub const AnatolianHieroglyphs: Script = Script(156); + pub const Arabic: Script = Script(2); + pub const Armenian: Script = Script(3); + pub const Avestan: Script = Script(117); + pub const Balinese: Script = Script(62); + pub const Bamum: Script = Script(130); + pub const BassaVah: Script = Script(134); + pub const Batak: Script = Script(63); + pub const Bengali: Script = Script(4); + pub const Bhaiksuki: Script = Script(168); + pub const Bopomofo: Script = Script(5); + pub const Brahmi: Script = Script(65); + pub const Braille: Script = Script(46); + pub const Buginese: Script = Script(55); + pub const Buhid: Script = Script(44); + pub const CanadianAboriginal: Script = Script(40); + pub const Carian: Script = Script(104); + pub const CaucasianAlbanian: Script = Script(159); + pub const Chakma: Script = Script(118); + pub const Cham: Script = Script(66); + pub const Cherokee: Script = Script(6); + pub const Chorasmian: Script = Script(189); + pub const Common: Script = Script(0); + pub const Coptic: Script = Script(7); + pub const Cuneiform: Script = Script(101); + pub const Cypriot: Script = Script(47); + pub const CyproMinoan: Script = Script(193); + pub const Cyrillic: Script = Script(8); + pub const Deseret: Script = Script(9); + pub const Devanagari: Script = Script(10); + pub const DivesAkuru: Script = Script(190); + pub const Dogra: Script = Script(178); + pub const Duployan: Script = Script(135); + pub const EgyptianHieroglyphs: Script = Script(71); + pub const Elbasan: Script = Script(136); + pub const Elymaic: Script = Script(185); + pub const Ethiopian: Script = Script(11); + pub const Georgian: Script = Script(12); + pub const Glagolitic: Script = Script(56); + pub const Gothic: Script = Script(13); + pub const Grantha: Script = Script(137); + pub const Greek: Script = Script(14); + pub const Gujarati: Script = 
Script(15); + pub const GunjalaGondi: Script = Script(179); + pub const Gurmukhi: Script = Script(16); + pub const Han: Script = Script(17); + pub const Hangul: Script = Script(18); + pub const HanifiRohingya: Script = Script(182); + pub const Hanunoo: Script = Script(43); + pub const Hatran: Script = Script(162); + pub const Hebrew: Script = Script(19); + pub const Hiragana: Script = Script(20); + pub const ImperialAramaic: Script = Script(116); + pub const Inherited: Script = Script(1); + pub const InscriptionalPahlavi: Script = Script(122); + pub const InscriptionalParthian: Script = Script(125); + pub const Javanese: Script = Script(78); + pub const Kaithi: Script = Script(120); + pub const Kannada: Script = Script(21); + pub const Katakana: Script = Script(22); + pub const Kawi: Script = Script(198); + pub const KayahLi: Script = Script(79); + pub const Kharoshthi: Script = Script(57); + pub const KhitanSmallScript: Script = Script(191); + pub const Khmer: Script = Script(23); + pub const Khojki: Script = Script(157); + pub const Khudawadi: Script = Script(145); + pub const Lao: Script = Script(24); + pub const Latin: Script = Script(25); + pub const Lepcha: Script = Script(82); + pub const Limbu: Script = Script(48); + pub const LinearA: Script = Script(83); + pub const LinearB: Script = Script(49); + pub const Lisu: Script = Script(131); + pub const Lycian: Script = Script(107); + pub const Lydian: Script = Script(108); + pub const Mahajani: Script = Script(160); + pub const Makasar: Script = Script(180); + pub const Malayalam: Script = Script(26); + pub const Mandaic: Script = Script(84); + pub const Manichaean: Script = Script(121); + pub const Marchen: Script = Script(169); + pub const MasaramGondi: Script = Script(175); + pub const Medefaidrin: Script = Script(181); + pub const MeeteiMayek: Script = Script(115); + pub const MendeKikakui: Script = Script(140); + pub const MeroiticCursive: Script = Script(141); + pub const MeroiticHieroglyphs: Script = 
Script(86); + pub const Miao: Script = Script(92); + pub const Modi: Script = Script(163); + pub const Mongolian: Script = Script(27); + pub const Mro: Script = Script(149); + pub const Multani: Script = Script(164); + pub const Myanmar: Script = Script(28); + pub const Nabataean: Script = Script(143); + pub const NagMundari: Script = Script(199); + pub const Nandinagari: Script = Script(187); + pub const NewTaiLue: Script = Script(59); + pub const Newa: Script = Script(170); + pub const Nko: Script = Script(87); + pub const Nushu: Script = Script(150); + pub const NyiakengPuachueHmong: Script = Script(186); + pub const Ogham: Script = Script(29); + pub const OlChiki: Script = Script(109); + pub const OldHungarian: Script = Script(76); + pub const OldItalic: Script = Script(30); + pub const OldNorthArabian: Script = Script(142); + pub const OldPermic: Script = Script(89); + pub const OldPersian: Script = Script(61); + pub const OldSogdian: Script = Script(184); + pub const OldSouthArabian: Script = Script(133); + pub const OldTurkic: Script = Script(88); + pub const OldUyghur: Script = Script(194); + pub const Oriya: Script = Script(31); + pub const Osage: Script = Script(171); + pub const Osmanya: Script = Script(50); + pub const PahawhHmong: Script = Script(75); + pub const Palmyrene: Script = Script(144); + pub const PauCinHau: Script = Script(165); + pub const PhagsPa: Script = Script(90); + pub const Phoenician: Script = Script(91); + pub const PsalterPahlavi: Script = Script(123); + pub const Rejang: Script = Script(110); + pub const Runic: Script = Script(32); + pub const Samaritan: Script = Script(126); + pub const Saurashtra: Script = Script(111); + pub const Sharada: Script = Script(151); + pub const Shavian: Script = Script(51); + pub const Siddham: Script = Script(166); + pub const SignWriting: Script = Script(112); + pub const Sinhala: Script = Script(33); + pub const Sogdian: Script = Script(183); + pub const SoraSompeng: Script = Script(152); + pub 
const Soyombo: Script = Script(176); + pub const Sundanese: Script = Script(113); + pub const SylotiNagri: Script = Script(58); + pub const Syriac: Script = Script(34); + pub const Tagalog: Script = Script(42); + pub const Tagbanwa: Script = Script(45); + pub const TaiLe: Script = Script(52); + pub const TaiTham: Script = Script(106); + pub const TaiViet: Script = Script(127); + pub const Takri: Script = Script(153); + pub const Tamil: Script = Script(35); + pub const Tangsa: Script = Script(195); + pub const Tangut: Script = Script(154); + pub const Telugu: Script = Script(36); + pub const Thaana: Script = Script(37); + pub const Thai: Script = Script(38); + pub const Tibetan: Script = Script(39); + pub const Tifinagh: Script = Script(60); + pub const Tirhuta: Script = Script(158); + pub const Toto: Script = Script(196); + pub const Ugaritic: Script = Script(53); + pub const Unknown: Script = Script(103); + pub const Vai: Script = Script(99); + pub const Vithkuqi: Script = Script(197); + pub const Wancho: Script = Script(188); + pub const WarangCiti: Script = Script(146); + pub const Yezidi: Script = Script(192); + pub const Yi: Script = Script(41); + pub const ZanabazarSquare: Script = Script(177); +} + +impl_value_getter! { + markers: ScriptNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_SC_V1, ScriptValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR4_SC_V1, ScriptValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_SC_V1; + impl Script { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Script` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::Script; + /// + /// let lookup = Script::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("Brah"), Some(Script::Brahmi)); + /// assert_eq!(lookup.get_strict("Hang"), Some(Script::Hangul)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Brahmi"), Some(Script::Brahmi)); + /// assert_eq!(lookup.get_strict("Hangul"), Some(Script::Hangul)); + /// // name has incorrect casing + /// assert_eq!(lookup.get_strict("brahmi"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("brahmi"), Some(Script::Brahmi)); + /// // fake property + /// assert_eq!(lookup.get_strict("Linear_Z"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearTiny4Mapper`], capable of looking up short names + /// for values of the `Script` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::Script; + /// use tinystr::tinystr; + /// + /// let lookup = Script::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(Script::Brahmi), Some(tinystr!(4, "Brah"))); + /// assert_eq!(lookup.get(Script::Hangul), Some(tinystr!(4, "Hang"))); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearTiny4Mapper / PropertyEnumToValueNameLinearTiny4MapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Script` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::Script; + /// + /// let lookup = Script::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(Script::Brahmi), Some("Brahmi")); + /// assert_eq!(lookup.get(Script::Hangul), Some("Hangul")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +/// Enumerated property East_Asian_Width. +/// +/// See "Definition" in UAX #11 for the summary of each property value: +/// <https://www.unicode.org/reports/tr11/#Definitions> +/// +/// The numeric value is compatible with `UEastAsianWidth` in ICU4C. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(EastAsianWidthULE)] +pub struct EastAsianWidth(pub u8); + +#[allow(missing_docs)] // These constants don't need individual documentation. +#[allow(non_upper_case_globals)] +impl EastAsianWidth { + pub const Neutral: EastAsianWidth = EastAsianWidth(0); //name="N" + pub const Ambiguous: EastAsianWidth = EastAsianWidth(1); //name="A" + pub const Halfwidth: EastAsianWidth = EastAsianWidth(2); //name="H" + pub const Fullwidth: EastAsianWidth = EastAsianWidth(3); //name="F" + pub const Narrow: EastAsianWidth = EastAsianWidth(4); //name="Na" + pub const Wide: EastAsianWidth = EastAsianWidth(5); //name="W" +} + +impl_value_getter! 
{ + markers: EastAsianWidthNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_EA_V1, EastAsianWidthValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_EA_V1, EastAsianWidthValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_EA_V1; + impl EastAsianWidth { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `East_Asian_Width` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::EastAsianWidth; + /// + /// let lookup = EastAsianWidth::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("N"), Some(EastAsianWidth::Neutral)); + /// assert_eq!(lookup.get_strict("H"), Some(EastAsianWidth::Halfwidth)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Neutral"), Some(EastAsianWidth::Neutral)); + /// assert_eq!(lookup.get_strict("Halfwidth"), Some(EastAsianWidth::Halfwidth)); + /// // name has incorrect casing / extra hyphen + /// assert_eq!(lookup.get_strict("half-width"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("half-width"), Some(EastAsianWidth::Halfwidth)); + /// // fake property + /// assert_eq!(lookup.get_strict("TwoPointFiveWidth"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `East_Asian_Width` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::EastAsianWidth; + /// + /// let lookup = EastAsianWidth::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(EastAsianWidth::Neutral), Some("N")); + /// assert_eq!(lookup.get(EastAsianWidth::Halfwidth), Some("H")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `East_Asian_Width` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::EastAsianWidth; + /// + /// let lookup = EastAsianWidth::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(EastAsianWidth::Neutral), Some("Neutral")); + /// assert_eq!(lookup.get(EastAsianWidth::Halfwidth), Some("Halfwidth")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +/// Enumerated property Line_Break. +/// +/// See "Line Breaking Properties" in UAX #14 for the summary of each property +/// value: <https://www.unicode.org/reports/tr14/#Properties> +/// +/// The numeric value is compatible with `ULineBreak` in ICU4C. 
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(LineBreakULE)] +pub struct LineBreak(pub u8); + +#[allow(missing_docs)] // These constants don't need individual documentation. +#[allow(non_upper_case_globals)] +impl LineBreak { + pub const Unknown: LineBreak = LineBreak(0); // name="XX" + pub const Ambiguous: LineBreak = LineBreak(1); // name="AI" + pub const Alphabetic: LineBreak = LineBreak(2); // name="AL" + pub const BreakBoth: LineBreak = LineBreak(3); // name="B2" + pub const BreakAfter: LineBreak = LineBreak(4); // name="BA" + pub const BreakBefore: LineBreak = LineBreak(5); // name="BB" + pub const MandatoryBreak: LineBreak = LineBreak(6); // name="BK" + pub const ContingentBreak: LineBreak = LineBreak(7); // name="CB" + pub const ClosePunctuation: LineBreak = LineBreak(8); // name="CL" + pub const CombiningMark: LineBreak = LineBreak(9); // name="CM" + pub const CarriageReturn: LineBreak = LineBreak(10); // name="CR" + pub const Exclamation: LineBreak = LineBreak(11); // name="EX" + pub const Glue: LineBreak = LineBreak(12); // name="GL" + pub const Hyphen: LineBreak = LineBreak(13); // name="HY" + pub const Ideographic: LineBreak = LineBreak(14); // name="ID" + pub const Inseparable: LineBreak = LineBreak(15); // name="IN" + pub const InfixNumeric: LineBreak = LineBreak(16); // name="IS" + pub const LineFeed: LineBreak = LineBreak(17); // name="LF" + pub const Nonstarter: LineBreak = LineBreak(18); // name="NS" + pub const Numeric: LineBreak = LineBreak(19); // name="NU" + pub const OpenPunctuation: LineBreak = LineBreak(20); // name="OP" + pub const PostfixNumeric: LineBreak = LineBreak(21); // name="PO" + pub const PrefixNumeric: LineBreak = LineBreak(22); // 
name="PR" + pub const Quotation: LineBreak = LineBreak(23); // name="QU" + pub const ComplexContext: LineBreak = LineBreak(24); // name="SA" + pub const Surrogate: LineBreak = LineBreak(25); // name="SG" + pub const Space: LineBreak = LineBreak(26); // name="SP" + pub const BreakSymbols: LineBreak = LineBreak(27); // name="SY" + pub const ZWSpace: LineBreak = LineBreak(28); // name="ZW" + pub const NextLine: LineBreak = LineBreak(29); // name="NL" + pub const WordJoiner: LineBreak = LineBreak(30); // name="WJ" + pub const H2: LineBreak = LineBreak(31); // name="H2" + pub const H3: LineBreak = LineBreak(32); // name="H3" + pub const JL: LineBreak = LineBreak(33); // name="JL" + pub const JT: LineBreak = LineBreak(34); // name="JT" + pub const JV: LineBreak = LineBreak(35); // name="JV" + pub const CloseParenthesis: LineBreak = LineBreak(36); // name="CP" + pub const ConditionalJapaneseStarter: LineBreak = LineBreak(37); // name="CJ" + pub const HebrewLetter: LineBreak = LineBreak(38); // name="HL" + pub const RegionalIndicator: LineBreak = LineBreak(39); // name="RI" + pub const EBase: LineBreak = LineBreak(40); // name="EB" + pub const EModifier: LineBreak = LineBreak(41); // name="EM" + pub const ZWJ: LineBreak = LineBreak(42); // name="ZWJ" + + // Added in ICU 74: + pub const Aksara: LineBreak = LineBreak(43); // name="AK" + pub const AksaraPrebase: LineBreak = LineBreak(44); // name="AP" + pub const AksaraStart: LineBreak = LineBreak(45); // name="AS" + pub const ViramaFinal: LineBreak = LineBreak(46); // name="VF" + pub const Virama: LineBreak = LineBreak(47); // name="VI" +} + +impl_value_getter! 
{ + markers: LineBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_LB_V1, LineBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_LB_V1, LineBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_LB_V1; + impl LineBreak { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Line_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::LineBreak; + /// + /// let lookup = LineBreak::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("BK"), Some(LineBreak::MandatoryBreak)); + /// assert_eq!(lookup.get_strict("AL"), Some(LineBreak::Alphabetic)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Mandatory_Break"), Some(LineBreak::MandatoryBreak)); + /// assert_eq!(lookup.get_strict("Alphabetic"), Some(LineBreak::Alphabetic)); + /// // name has incorrect casing and dash instead of underscore + /// assert_eq!(lookup.get_strict("mandatory-Break"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("mandatory-Break"), Some(LineBreak::MandatoryBreak)); + /// // fake property + /// assert_eq!(lookup.get_strict("Stochastic_Break"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `Line_Break` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::LineBreak; + /// + /// let lookup = LineBreak::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(LineBreak::MandatoryBreak), Some("BK")); + /// assert_eq!(lookup.get(LineBreak::Alphabetic), Some("AL")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Line_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::LineBreak; + /// + /// let lookup = LineBreak::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(LineBreak::MandatoryBreak), Some("Mandatory_Break")); + /// assert_eq!(lookup.get(LineBreak::Alphabetic), Some("Alphabetic")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +/// Enumerated property Grapheme_Cluster_Break. +/// +/// See "Default Grapheme Cluster Boundary Specification" in UAX #29 for the +/// summary of each property value: +/// <https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table> +/// +/// The numeric value is compatible with `UGraphemeClusterBreak` in ICU4C. 
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // this type is stable +#[repr(transparent)] +#[zerovec::make_ule(GraphemeClusterBreakULE)] +pub struct GraphemeClusterBreak(pub u8); + +#[allow(missing_docs)] // These constants don't need individual documentation. +#[allow(non_upper_case_globals)] +impl GraphemeClusterBreak { + pub const Other: GraphemeClusterBreak = GraphemeClusterBreak(0); // name="XX" + pub const Control: GraphemeClusterBreak = GraphemeClusterBreak(1); // name="CN" + pub const CR: GraphemeClusterBreak = GraphemeClusterBreak(2); // name="CR" + pub const Extend: GraphemeClusterBreak = GraphemeClusterBreak(3); // name="EX" + pub const L: GraphemeClusterBreak = GraphemeClusterBreak(4); // name="L" + pub const LF: GraphemeClusterBreak = GraphemeClusterBreak(5); // name="LF" + pub const LV: GraphemeClusterBreak = GraphemeClusterBreak(6); // name="LV" + pub const LVT: GraphemeClusterBreak = GraphemeClusterBreak(7); // name="LVT" + pub const T: GraphemeClusterBreak = GraphemeClusterBreak(8); // name="T" + pub const V: GraphemeClusterBreak = GraphemeClusterBreak(9); // name="V" + pub const SpacingMark: GraphemeClusterBreak = GraphemeClusterBreak(10); // name="SM" + pub const Prepend: GraphemeClusterBreak = GraphemeClusterBreak(11); // name="PP" + pub const RegionalIndicator: GraphemeClusterBreak = GraphemeClusterBreak(12); // name="RI" + /// This value is obsolete and unused. + pub const EBase: GraphemeClusterBreak = GraphemeClusterBreak(13); // name="EB" + /// This value is obsolete and unused. + pub const EBaseGAZ: GraphemeClusterBreak = GraphemeClusterBreak(14); // name="EBG" + /// This value is obsolete and unused. 
+ pub const EModifier: GraphemeClusterBreak = GraphemeClusterBreak(15); // name="EM" + /// This value is obsolete and unused. + pub const GlueAfterZwj: GraphemeClusterBreak = GraphemeClusterBreak(16); // name="GAZ" + pub const ZWJ: GraphemeClusterBreak = GraphemeClusterBreak(17); // name="ZWJ" +} + +impl_value_getter! { + markers: GraphemeClusterBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_GCB_V1, GraphemeClusterBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_GCB_V1, GraphemeClusterBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_GCB_V1; + impl GraphemeClusterBreak { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Grapheme_Cluster_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GraphemeClusterBreak; + /// + /// let lookup = GraphemeClusterBreak::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("EX"), Some(GraphemeClusterBreak::Extend)); + /// assert_eq!(lookup.get_strict("RI"), Some(GraphemeClusterBreak::RegionalIndicator)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Extend"), Some(GraphemeClusterBreak::Extend)); + /// assert_eq!(lookup.get_strict("Regional_Indicator"), Some(GraphemeClusterBreak::RegionalIndicator)); + /// // name has incorrect casing and lacks an underscore + /// assert_eq!(lookup.get_strict("regionalindicator"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("regionalindicator"), Some(GraphemeClusterBreak::RegionalIndicator)); + /// // fake property + /// assert_eq!(lookup.get_strict("Regional_Indicator_Two_Point_Oh"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names 
+ /// for values of the `Grapheme_Cluster_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GraphemeClusterBreak; + /// + /// let lookup = GraphemeClusterBreak::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(GraphemeClusterBreak::Extend), Some("EX")); + /// assert_eq!(lookup.get(GraphemeClusterBreak::RegionalIndicator), Some("RI")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Grapheme_Cluster_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::GraphemeClusterBreak; + /// + /// let lookup = GraphemeClusterBreak::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(GraphemeClusterBreak::Extend), Some("Extend")); + /// assert_eq!(lookup.get(GraphemeClusterBreak::RegionalIndicator), Some("Regional_Indicator")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +/// Enumerated property Word_Break. +/// +/// See "Default Word Boundary Specification" in UAX #29 for the summary of +/// each property value: +/// <https://www.unicode.org/reports/tr29/#Default_Word_Boundaries>. +/// +/// The numeric value is compatible with `UWordBreakValues` in ICU4C. 
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(WordBreakULE)] +pub struct WordBreak(pub u8); + +#[allow(missing_docs)] // These constants don't need individual documentation. +#[allow(non_upper_case_globals)] +impl WordBreak { + pub const Other: WordBreak = WordBreak(0); // name="XX" + pub const ALetter: WordBreak = WordBreak(1); // name="LE" + pub const Format: WordBreak = WordBreak(2); // name="FO" + pub const Katakana: WordBreak = WordBreak(3); // name="KA" + pub const MidLetter: WordBreak = WordBreak(4); // name="ML" + pub const MidNum: WordBreak = WordBreak(5); // name="MN" + pub const Numeric: WordBreak = WordBreak(6); // name="NU" + pub const ExtendNumLet: WordBreak = WordBreak(7); // name="EX" + pub const CR: WordBreak = WordBreak(8); // name="CR" + pub const Extend: WordBreak = WordBreak(9); // name="Extend" + pub const LF: WordBreak = WordBreak(10); // name="LF" + pub const MidNumLet: WordBreak = WordBreak(11); // name="MB" + pub const Newline: WordBreak = WordBreak(12); // name="NL" + pub const RegionalIndicator: WordBreak = WordBreak(13); // name="RI" + pub const HebrewLetter: WordBreak = WordBreak(14); // name="HL" + pub const SingleQuote: WordBreak = WordBreak(15); // name="SQ" + pub const DoubleQuote: WordBreak = WordBreak(16); // name="DQ" + /// This value is obsolete and unused. + pub const EBase: WordBreak = WordBreak(17); // name="EB" + /// This value is obsolete and unused. + pub const EBaseGAZ: WordBreak = WordBreak(18); // name="EBG" + /// This value is obsolete and unused. + pub const EModifier: WordBreak = WordBreak(19); // name="EM" + /// This value is obsolete and unused. 
+ pub const GlueAfterZwj: WordBreak = WordBreak(20); // name="GAZ" + pub const ZWJ: WordBreak = WordBreak(21); // name="ZWJ" + pub const WSegSpace: WordBreak = WordBreak(22); // name="WSegSpace" +} + +impl_value_getter! { + markers: WordBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_WB_V1, WordBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_WB_V1, WordBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_WB_V1; + impl WordBreak { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Word_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::WordBreak; + /// + /// let lookup = WordBreak::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("KA"), Some(WordBreak::Katakana)); + /// assert_eq!(lookup.get_strict("LE"), Some(WordBreak::ALetter)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Katakana"), Some(WordBreak::Katakana)); + /// assert_eq!(lookup.get_strict("ALetter"), Some(WordBreak::ALetter)); + /// // name has incorrect casing + /// assert_eq!(lookup.get_strict("Aletter"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("Aletter"), Some(WordBreak::ALetter)); + /// assert_eq!(lookup.get_loose("w_seg_space"), Some(WordBreak::WSegSpace)); + /// // fake property + /// assert_eq!(lookup.get_strict("Quadruple_Quote"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `Word_Break` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::WordBreak; + /// + /// let lookup = WordBreak::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(WordBreak::Katakana), Some("KA")); + /// assert_eq!(lookup.get(WordBreak::ALetter), Some("LE")); + /// assert_eq!(lookup.get(WordBreak::WSegSpace), Some("WSegSpace")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Word_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::WordBreak; + /// + /// let lookup = WordBreak::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(WordBreak::Katakana), Some("Katakana")); + /// assert_eq!(lookup.get(WordBreak::ALetter), Some("ALetter")); + /// assert_eq!(lookup.get(WordBreak::WSegSpace), Some("WSegSpace")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} + +/// Enumerated property Sentence_Break. +/// See "Default Sentence Boundary Specification" in UAX #29 for the summary of +/// each property value: +/// <https://www.unicode.org/reports/tr29/#Default_Sentence_Boundaries>. +/// +/// The numeric value is compatible with `USentenceBreak` in ICU4C. 
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(SentenceBreakULE)] +pub struct SentenceBreak(pub u8); + +#[allow(missing_docs)] // These constants don't need individual documentation. +#[allow(non_upper_case_globals)] +impl SentenceBreak { + pub const Other: SentenceBreak = SentenceBreak(0); // name="XX" + pub const ATerm: SentenceBreak = SentenceBreak(1); // name="AT" + pub const Close: SentenceBreak = SentenceBreak(2); // name="CL" + pub const Format: SentenceBreak = SentenceBreak(3); // name="FO" + pub const Lower: SentenceBreak = SentenceBreak(4); // name="LO" + pub const Numeric: SentenceBreak = SentenceBreak(5); // name="NU" + pub const OLetter: SentenceBreak = SentenceBreak(6); // name="LE" + pub const Sep: SentenceBreak = SentenceBreak(7); // name="SE" + pub const Sp: SentenceBreak = SentenceBreak(8); // name="SP" + pub const STerm: SentenceBreak = SentenceBreak(9); // name="ST" + pub const Upper: SentenceBreak = SentenceBreak(10); // name="UP" + pub const CR: SentenceBreak = SentenceBreak(11); // name="CR" + pub const Extend: SentenceBreak = SentenceBreak(12); // name="EX" + pub const LF: SentenceBreak = SentenceBreak(13); // name="LF" + pub const SContinue: SentenceBreak = SentenceBreak(14); // name="SC" +} + +impl_value_getter! { + markers: SentenceBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_SB_V1, SentenceBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_SB_V1, SentenceBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_SB_V1; + impl SentenceBreak { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Sentence_Break` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::SentenceBreak; + /// + /// let lookup = SentenceBreak::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("FO"), Some(SentenceBreak::Format)); + /// assert_eq!(lookup.get_strict("NU"), Some(SentenceBreak::Numeric)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Format"), Some(SentenceBreak::Format)); + /// assert_eq!(lookup.get_strict("Numeric"), Some(SentenceBreak::Numeric)); + /// // name has incorrect casing + /// assert_eq!(lookup.get_strict("fOrmat"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("fOrmat"), Some(SentenceBreak::Format)); + /// // fake property + /// assert_eq!(lookup.get_strict("Fixer_Upper"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `Sentence_Break` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::SentenceBreak; + /// + /// let lookup = SentenceBreak::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(SentenceBreak::Format), Some("FO")); + /// assert_eq!(lookup.get(SentenceBreak::Numeric), Some("NU")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Sentence_Break` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::SentenceBreak; + /// + /// let lookup = SentenceBreak::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(SentenceBreak::Format), Some("Format")); + /// assert_eq!(lookup.get(SentenceBreak::Numeric), Some("Numeric")); + /// assert_eq!(lookup.get(SentenceBreak::SContinue), Some("SContinue")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} +/// Property Canonical_Combining_Class. +/// See UAX #15: +/// <https://www.unicode.org/reports/tr15/>. +/// +/// See `icu_normalizer::properties::CanonicalCombiningClassMap` for the API +/// to look up the Canonical_Combining_Class property by scalar value. +// +// NOTE: The Pernosco debugger has special knowledge +// of this struct. Please do not change the bit layout +// or the crate-module-qualified name of this struct +// without coordination. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(CanonicalCombiningClassULE)] +pub struct CanonicalCombiningClass(pub u8); + +// These constant names come from PropertyValueAliases.txt +#[allow(missing_docs)] // These constants don't need individual documentation. 
+#[allow(non_upper_case_globals)] +impl CanonicalCombiningClass { + pub const NotReordered: CanonicalCombiningClass = CanonicalCombiningClass(0); // name="NR" + pub const Overlay: CanonicalCombiningClass = CanonicalCombiningClass(1); // name="OV" + pub const HanReading: CanonicalCombiningClass = CanonicalCombiningClass(6); // name="HANR" + pub const Nukta: CanonicalCombiningClass = CanonicalCombiningClass(7); // name="NK" + pub const KanaVoicing: CanonicalCombiningClass = CanonicalCombiningClass(8); // name="KV" + pub const Virama: CanonicalCombiningClass = CanonicalCombiningClass(9); // name="VR" + pub const CCC10: CanonicalCombiningClass = CanonicalCombiningClass(10); // name="CCC10" + pub const CCC11: CanonicalCombiningClass = CanonicalCombiningClass(11); // name="CCC11" + pub const CCC12: CanonicalCombiningClass = CanonicalCombiningClass(12); // name="CCC12" + pub const CCC13: CanonicalCombiningClass = CanonicalCombiningClass(13); // name="CCC13" + pub const CCC14: CanonicalCombiningClass = CanonicalCombiningClass(14); // name="CCC14" + pub const CCC15: CanonicalCombiningClass = CanonicalCombiningClass(15); // name="CCC15" + pub const CCC16: CanonicalCombiningClass = CanonicalCombiningClass(16); // name="CCC16" + pub const CCC17: CanonicalCombiningClass = CanonicalCombiningClass(17); // name="CCC17" + pub const CCC18: CanonicalCombiningClass = CanonicalCombiningClass(18); // name="CCC18" + pub const CCC19: CanonicalCombiningClass = CanonicalCombiningClass(19); // name="CCC19" + pub const CCC20: CanonicalCombiningClass = CanonicalCombiningClass(20); // name="CCC20" + pub const CCC21: CanonicalCombiningClass = CanonicalCombiningClass(21); // name="CCC21" + pub const CCC22: CanonicalCombiningClass = CanonicalCombiningClass(22); // name="CCC22" + pub const CCC23: CanonicalCombiningClass = CanonicalCombiningClass(23); // name="CCC23" + pub const CCC24: CanonicalCombiningClass = CanonicalCombiningClass(24); // name="CCC24" + pub const CCC25: CanonicalCombiningClass = 
CanonicalCombiningClass(25); // name="CCC25" + pub const CCC26: CanonicalCombiningClass = CanonicalCombiningClass(26); // name="CCC26" + pub const CCC27: CanonicalCombiningClass = CanonicalCombiningClass(27); // name="CCC27" + pub const CCC28: CanonicalCombiningClass = CanonicalCombiningClass(28); // name="CCC28" + pub const CCC29: CanonicalCombiningClass = CanonicalCombiningClass(29); // name="CCC29" + pub const CCC30: CanonicalCombiningClass = CanonicalCombiningClass(30); // name="CCC30" + pub const CCC31: CanonicalCombiningClass = CanonicalCombiningClass(31); // name="CCC31" + pub const CCC32: CanonicalCombiningClass = CanonicalCombiningClass(32); // name="CCC32" + pub const CCC33: CanonicalCombiningClass = CanonicalCombiningClass(33); // name="CCC33" + pub const CCC34: CanonicalCombiningClass = CanonicalCombiningClass(34); // name="CCC34" + pub const CCC35: CanonicalCombiningClass = CanonicalCombiningClass(35); // name="CCC35" + pub const CCC36: CanonicalCombiningClass = CanonicalCombiningClass(36); // name="CCC36" + pub const CCC84: CanonicalCombiningClass = CanonicalCombiningClass(84); // name="CCC84" + pub const CCC91: CanonicalCombiningClass = CanonicalCombiningClass(91); // name="CCC91" + pub const CCC103: CanonicalCombiningClass = CanonicalCombiningClass(103); // name="CCC103" + pub const CCC107: CanonicalCombiningClass = CanonicalCombiningClass(107); // name="CCC107" + pub const CCC118: CanonicalCombiningClass = CanonicalCombiningClass(118); // name="CCC118" + pub const CCC122: CanonicalCombiningClass = CanonicalCombiningClass(122); // name="CCC122" + pub const CCC129: CanonicalCombiningClass = CanonicalCombiningClass(129); // name="CCC129" + pub const CCC130: CanonicalCombiningClass = CanonicalCombiningClass(130); // name="CCC130" + pub const CCC132: CanonicalCombiningClass = CanonicalCombiningClass(132); // name="CCC132" + pub const CCC133: CanonicalCombiningClass = CanonicalCombiningClass(133); // name="CCC133" // RESERVED + pub const 
AttachedBelowLeft: CanonicalCombiningClass = CanonicalCombiningClass(200); // name="ATBL" + pub const AttachedBelow: CanonicalCombiningClass = CanonicalCombiningClass(202); // name="ATB" + pub const AttachedAbove: CanonicalCombiningClass = CanonicalCombiningClass(214); // name="ATA" + pub const AttachedAboveRight: CanonicalCombiningClass = CanonicalCombiningClass(216); // name="ATAR" + pub const BelowLeft: CanonicalCombiningClass = CanonicalCombiningClass(218); // name="BL" + pub const Below: CanonicalCombiningClass = CanonicalCombiningClass(220); // name="B" + pub const BelowRight: CanonicalCombiningClass = CanonicalCombiningClass(222); // name="BR" + pub const Left: CanonicalCombiningClass = CanonicalCombiningClass(224); // name="L" + pub const Right: CanonicalCombiningClass = CanonicalCombiningClass(226); // name="R" + pub const AboveLeft: CanonicalCombiningClass = CanonicalCombiningClass(228); // name="AL" + pub const Above: CanonicalCombiningClass = CanonicalCombiningClass(230); // name="A" + pub const AboveRight: CanonicalCombiningClass = CanonicalCombiningClass(232); // name="AR" + pub const DoubleBelow: CanonicalCombiningClass = CanonicalCombiningClass(233); // name="DB" + pub const DoubleAbove: CanonicalCombiningClass = CanonicalCombiningClass(234); // name="DA" + pub const IotaSubscript: CanonicalCombiningClass = CanonicalCombiningClass(240); // name="IS" +} + +impl_value_getter! { + markers: CanonicalCombiningClassNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_CCC_V1, CanonicalCombiningClassValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_SPARSE_CCC_V1, CanonicalCombiningClassValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_SPARSE_CCC_V1; + impl CanonicalCombiningClass { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Canonical_Combining_Class` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::CanonicalCombiningClass; + /// + /// let lookup = CanonicalCombiningClass::name_to_enum_mapper(); + /// // short name for value + /// assert_eq!(lookup.get_strict("AL"), Some(CanonicalCombiningClass::AboveLeft)); + /// assert_eq!(lookup.get_strict("ATBL"), Some(CanonicalCombiningClass::AttachedBelowLeft)); + /// assert_eq!(lookup.get_strict("CCC10"), Some(CanonicalCombiningClass::CCC10)); + /// // long name for value + /// assert_eq!(lookup.get_strict("Above_Left"), Some(CanonicalCombiningClass::AboveLeft)); + /// assert_eq!(lookup.get_strict("Attached_Below_Left"), Some(CanonicalCombiningClass::AttachedBelowLeft)); + /// // name has incorrect casing and hyphens + /// assert_eq!(lookup.get_strict("attached-below-left"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("attached-below-left"), Some(CanonicalCombiningClass::AttachedBelowLeft)); + /// // fake property + /// assert_eq!(lookup.get_strict("Linear_Z"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameSparseMapper`], capable of looking up short names + /// for values of the `Canonical_Combining_Class` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::CanonicalCombiningClass; + /// + /// let lookup = CanonicalCombiningClass::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(CanonicalCombiningClass::AboveLeft), Some("AL")); + /// assert_eq!(lookup.get(CanonicalCombiningClass::AttachedBelowLeft), Some("ATBL")); + /// assert_eq!(lookup.get(CanonicalCombiningClass::CCC10), Some("CCC10")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameSparseMapper / PropertyEnumToValueNameSparseMapperBorrowed; + /// Return a [`PropertyEnumToValueNameSparseMapper`], capable of looking up long names + /// for values of the `Canonical_Combining_Class` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::CanonicalCombiningClass; + /// + /// let lookup = CanonicalCombiningClass::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(CanonicalCombiningClass::AboveLeft), Some("Above_Left")); + /// assert_eq!(lookup.get(CanonicalCombiningClass::AttachedBelowLeft), Some("Attached_Below_Left")); + /// assert_eq!(lookup.get(CanonicalCombiningClass::CCC10), Some("CCC10")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameSparseMapper / PropertyEnumToValueNameSparseMapperBorrowed; + } +} + +/// Property Indic_Syllabic_Category. +/// See UAX #44: +/// <https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category>. +/// +/// The numeric value is compatible with `UIndicSyllabicCategory` in ICU4C. 
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties))] +#[allow(clippy::exhaustive_structs)] // newtype +#[repr(transparent)] +#[zerovec::make_ule(IndicSyllabicCategoryULE)] +pub struct IndicSyllabicCategory(pub u8); + +#[allow(missing_docs)] // These constants don't need individual documentation. +#[allow(non_upper_case_globals)] +impl IndicSyllabicCategory { + pub const Other: IndicSyllabicCategory = IndicSyllabicCategory(0); + pub const Avagraha: IndicSyllabicCategory = IndicSyllabicCategory(1); + pub const Bindu: IndicSyllabicCategory = IndicSyllabicCategory(2); + pub const BrahmiJoiningNumber: IndicSyllabicCategory = IndicSyllabicCategory(3); + pub const CantillationMark: IndicSyllabicCategory = IndicSyllabicCategory(4); + pub const Consonant: IndicSyllabicCategory = IndicSyllabicCategory(5); + pub const ConsonantDead: IndicSyllabicCategory = IndicSyllabicCategory(6); + pub const ConsonantFinal: IndicSyllabicCategory = IndicSyllabicCategory(7); + pub const ConsonantHeadLetter: IndicSyllabicCategory = IndicSyllabicCategory(8); + pub const ConsonantInitialPostfixed: IndicSyllabicCategory = IndicSyllabicCategory(9); + pub const ConsonantKiller: IndicSyllabicCategory = IndicSyllabicCategory(10); + pub const ConsonantMedial: IndicSyllabicCategory = IndicSyllabicCategory(11); + pub const ConsonantPlaceholder: IndicSyllabicCategory = IndicSyllabicCategory(12); + pub const ConsonantPrecedingRepha: IndicSyllabicCategory = IndicSyllabicCategory(13); + pub const ConsonantPrefixed: IndicSyllabicCategory = IndicSyllabicCategory(14); + pub const ConsonantSucceedingRepha: IndicSyllabicCategory = IndicSyllabicCategory(15); // NOTE(review): this and ConsonantSubjoined are the only pair not in alphabetical order; confirm 15/16 against ICU4C `UIndicSyllabicCategory` + pub const ConsonantSubjoined: IndicSyllabicCategory = IndicSyllabicCategory(16); + pub const ConsonantWithStacker: IndicSyllabicCategory = 
IndicSyllabicCategory(17); + pub const GeminationMark: IndicSyllabicCategory = IndicSyllabicCategory(18); + pub const InvisibleStacker: IndicSyllabicCategory = IndicSyllabicCategory(19); + pub const Joiner: IndicSyllabicCategory = IndicSyllabicCategory(20); + pub const ModifyingLetter: IndicSyllabicCategory = IndicSyllabicCategory(21); + pub const NonJoiner: IndicSyllabicCategory = IndicSyllabicCategory(22); + pub const Nukta: IndicSyllabicCategory = IndicSyllabicCategory(23); + pub const Number: IndicSyllabicCategory = IndicSyllabicCategory(24); + pub const NumberJoiner: IndicSyllabicCategory = IndicSyllabicCategory(25); + pub const PureKiller: IndicSyllabicCategory = IndicSyllabicCategory(26); + pub const RegisterShifter: IndicSyllabicCategory = IndicSyllabicCategory(27); + pub const SyllableModifier: IndicSyllabicCategory = IndicSyllabicCategory(28); + pub const ToneLetter: IndicSyllabicCategory = IndicSyllabicCategory(29); + pub const ToneMark: IndicSyllabicCategory = IndicSyllabicCategory(30); + pub const Virama: IndicSyllabicCategory = IndicSyllabicCategory(31); + pub const Visarga: IndicSyllabicCategory = IndicSyllabicCategory(32); + pub const Vowel: IndicSyllabicCategory = IndicSyllabicCategory(33); + pub const VowelDependent: IndicSyllabicCategory = IndicSyllabicCategory(34); + pub const VowelIndependent: IndicSyllabicCategory = IndicSyllabicCategory(35); +} + +impl_value_getter! { + markers: IndicSyllabicCategoryNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_INSC_V1, IndicSyllabicCategoryValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_INSC_V1, IndicSyllabicCategoryValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_INSC_V1; + impl IndicSyllabicCategory { + /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values + /// from strings for the `Indic_Syllabic_Category` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::IndicSyllabicCategory; + /// + /// let lookup = IndicSyllabicCategory::name_to_enum_mapper(); + /// // long/short name for value + /// assert_eq!(lookup.get_strict("Brahmi_Joining_Number"), Some(IndicSyllabicCategory::BrahmiJoiningNumber)); + /// assert_eq!(lookup.get_strict("Vowel_Independent"), Some(IndicSyllabicCategory::VowelIndependent)); + /// // name has incorrect casing and hyphens + /// assert_eq!(lookup.get_strict("brahmi-joining-number"), None); + /// // loose matching of name + /// assert_eq!(lookup.get_loose("brahmi-joining-number"), Some(IndicSyllabicCategory::BrahmiJoiningNumber)); + /// // fake property + /// assert_eq!(lookup.get_strict("Tone_Number"), None); + /// ``` + pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names + /// for values of the `Indic_Syllabic_Category` enumerated property. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::IndicSyllabicCategory; + /// + /// let lookup = IndicSyllabicCategory::enum_to_short_name_mapper(); + /// assert_eq!(lookup.get(IndicSyllabicCategory::BrahmiJoiningNumber), Some("Brahmi_Joining_Number")); + /// assert_eq!(lookup.get(IndicSyllabicCategory::VowelIndependent), Some("Vowel_Independent")); + /// ``` + pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names + /// for values of the `Indic_Syllabic_Category` enumerated property. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu::properties::IndicSyllabicCategory; + /// + /// let lookup = IndicSyllabicCategory::enum_to_long_name_mapper(); + /// assert_eq!(lookup.get(IndicSyllabicCategory::BrahmiJoiningNumber), Some("Brahmi_Joining_Number")); + /// assert_eq!(lookup.get(IndicSyllabicCategory::VowelIndependent), Some("Vowel_Independent")); + /// ``` + pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; + } +} diff --git a/third_party/rust/icu_properties/src/provider.rs b/third_party/rust/icu_properties/src/provider.rs new file mode 100644 index 0000000000..53fb2d5fd7 --- /dev/null +++ b/third_party/rust/icu_properties/src/provider.rs @@ -0,0 +1,900 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// Provider structs must be stable +#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] + +//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. +//! +//! <div class="stab unstable"> +//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! to be stable, their Rust representation might not be. Use with caution. +//! </div> +//! +//! 
Read more about data providers: [`icu_provider`] + +pub mod names; + +use crate::script::ScriptWithExt; +use crate::Script; + +use core::ops::RangeInclusive; +use core::str; +use icu_collections::codepointinvlist::CodePointInversionList; +use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList; +use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue}; +use icu_provider::prelude::*; +use icu_provider::{DataKeyMetadata, FallbackPriority}; +use zerofrom::ZeroFrom; + +use zerovec::{VarZeroVec, ZeroSlice, ZeroVecError}; + +#[cfg(feature = "compiled_data")] +#[derive(Debug)] +/// Baked data +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only +/// guaranteed to match with this version's `*_unstable` providers. Use with caution. +/// </div> +pub struct Baked; + +#[cfg(feature = "compiled_data")] +const _: () = { + pub mod icu { + pub use crate as properties; + pub use icu_collections as collections; + pub use icu_locid_transform as locid_transform; + } + icu_properties_data::make_provider!(Baked); + icu_properties_data::impl_propnames_from_gcb_v1!(Baked); + icu_properties_data::impl_propnames_from_bc_v1!(Baked); + icu_properties_data::impl_propnames_from_ccc_v1!(Baked); + icu_properties_data::impl_propnames_from_ea_v1!(Baked); + icu_properties_data::impl_propnames_from_gc_v1!(Baked); + icu_properties_data::impl_propnames_from_gcm_v1!(Baked); + icu_properties_data::impl_propnames_from_insc_v1!(Baked); + icu_properties_data::impl_propnames_from_lb_v1!(Baked); + icu_properties_data::impl_propnames_from_sb_v1!(Baked); + icu_properties_data::impl_propnames_from_sc_v1!(Baked); + icu_properties_data::impl_propnames_from_wb_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_bc_v1!(Baked); + 
icu_properties_data::impl_propnames_to_long_linear_ea_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_gc_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_gcb_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_insc_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_lb_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_sb_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_sc_v1!(Baked); + icu_properties_data::impl_propnames_to_long_linear_wb_v1!(Baked); + icu_properties_data::impl_propnames_to_long_sparse_ccc_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_bc_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_ea_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_gc_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_gcb_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_insc_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_lb_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_sb_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear_wb_v1!(Baked); + icu_properties_data::impl_propnames_to_short_linear4_sc_v1!(Baked); + icu_properties_data::impl_propnames_to_short_sparse_ccc_v1!(Baked); + icu_properties_data::impl_props_ahex_v1!(Baked); + icu_properties_data::impl_props_alnum_v1!(Baked); + icu_properties_data::impl_props_alpha_v1!(Baked); + icu_properties_data::impl_props_basic_emoji_v1!(Baked); + icu_properties_data::impl_props_bc_v1!(Baked); + icu_properties_data::impl_props_bidi_c_v1!(Baked); + icu_properties_data::impl_props_bidi_m_v1!(Baked); + icu_properties_data::impl_props_bidiauxiliaryprops_v1!(Baked); + icu_properties_data::impl_props_blank_v1!(Baked); + icu_properties_data::impl_props_cased_v1!(Baked); + icu_properties_data::impl_props_ccc_v1!(Baked); + icu_properties_data::impl_props_ci_v1!(Baked); + icu_properties_data::impl_props_comp_ex_v1!(Baked); 
+ icu_properties_data::impl_props_cwcf_v1!(Baked); + icu_properties_data::impl_props_cwcm_v1!(Baked); + icu_properties_data::impl_props_cwkcf_v1!(Baked); + icu_properties_data::impl_props_cwl_v1!(Baked); + icu_properties_data::impl_props_cwt_v1!(Baked); + icu_properties_data::impl_props_cwu_v1!(Baked); + icu_properties_data::impl_props_dash_v1!(Baked); + icu_properties_data::impl_props_dep_v1!(Baked); + icu_properties_data::impl_props_di_v1!(Baked); + icu_properties_data::impl_props_dia_v1!(Baked); + icu_properties_data::impl_props_ea_v1!(Baked); + icu_properties_data::impl_props_ebase_v1!(Baked); + icu_properties_data::impl_props_ecomp_v1!(Baked); + icu_properties_data::impl_props_emod_v1!(Baked); + icu_properties_data::impl_props_emoji_v1!(Baked); + icu_properties_data::impl_props_epres_v1!(Baked); + icu_properties_data::impl_props_exemplarchars_auxiliary_v1!(Baked); + icu_properties_data::impl_props_exemplarchars_index_v1!(Baked); + icu_properties_data::impl_props_exemplarchars_main_v1!(Baked); + icu_properties_data::impl_props_exemplarchars_numbers_v1!(Baked); + icu_properties_data::impl_props_exemplarchars_punctuation_v1!(Baked); + icu_properties_data::impl_props_ext_v1!(Baked); + icu_properties_data::impl_props_extpict_v1!(Baked); + icu_properties_data::impl_props_gc_v1!(Baked); + icu_properties_data::impl_props_gcb_v1!(Baked); + icu_properties_data::impl_props_gr_base_v1!(Baked); + icu_properties_data::impl_props_gr_ext_v1!(Baked); + icu_properties_data::impl_props_gr_link_v1!(Baked); + icu_properties_data::impl_props_graph_v1!(Baked); + icu_properties_data::impl_props_hex_v1!(Baked); + icu_properties_data::impl_props_hyphen_v1!(Baked); + icu_properties_data::impl_props_idc_v1!(Baked); + icu_properties_data::impl_props_ideo_v1!(Baked); + icu_properties_data::impl_props_ids_v1!(Baked); + icu_properties_data::impl_props_idsb_v1!(Baked); + icu_properties_data::impl_props_idst_v1!(Baked); + icu_properties_data::impl_props_insc_v1!(Baked); + 
icu_properties_data::impl_props_join_c_v1!(Baked); + icu_properties_data::impl_props_lb_v1!(Baked); + icu_properties_data::impl_props_loe_v1!(Baked); + icu_properties_data::impl_props_lower_v1!(Baked); + icu_properties_data::impl_props_math_v1!(Baked); + icu_properties_data::impl_props_nchar_v1!(Baked); + icu_properties_data::impl_props_nfcinert_v1!(Baked); + icu_properties_data::impl_props_nfdinert_v1!(Baked); + icu_properties_data::impl_props_nfkcinert_v1!(Baked); + icu_properties_data::impl_props_nfkdinert_v1!(Baked); + icu_properties_data::impl_props_pat_syn_v1!(Baked); + icu_properties_data::impl_props_pat_ws_v1!(Baked); + icu_properties_data::impl_props_pcm_v1!(Baked); + icu_properties_data::impl_props_print_v1!(Baked); + icu_properties_data::impl_props_qmark_v1!(Baked); + icu_properties_data::impl_props_radical_v1!(Baked); + icu_properties_data::impl_props_ri_v1!(Baked); + icu_properties_data::impl_props_sb_v1!(Baked); + icu_properties_data::impl_props_sc_v1!(Baked); + icu_properties_data::impl_props_scx_v1!(Baked); + icu_properties_data::impl_props_sd_v1!(Baked); + icu_properties_data::impl_props_segstart_v1!(Baked); + icu_properties_data::impl_props_sensitive_v1!(Baked); + icu_properties_data::impl_props_sterm_v1!(Baked); + icu_properties_data::impl_props_term_v1!(Baked); + icu_properties_data::impl_props_uideo_v1!(Baked); + icu_properties_data::impl_props_upper_v1!(Baked); + icu_properties_data::impl_props_vs_v1!(Baked); + icu_properties_data::impl_props_wb_v1!(Baked); + icu_properties_data::impl_props_wspace_v1!(Baked); + icu_properties_data::impl_props_xdigit_v1!(Baked); + icu_properties_data::impl_props_xidc_v1!(Baked); + icu_properties_data::impl_props_xids_v1!(Baked); +}; + +// include the specialized structs for the compact representation of Bidi property data +pub mod bidi_data; + +/// A set of characters which share a particular property value. +/// +/// This data enum is extensible, more backends may be added in the future. 
+/// Old data can be used with newer code but not vice versa. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[non_exhaustive] +pub enum PropertyCodePointSetV1<'data> { + /// The set of characters, represented as an inversion list + InversionList(#[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionList<'data>), + // new variants should go BELOW existing ones + // Serde serializes based on variant name and index in the enum + // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant +} + +/// A map efficiently storing data about individual characters. +/// +/// This data enum is extensible, more backends may be added in the future. +/// Old data can be used with newer code but not vice versa. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +#[derive(Clone, Debug, Eq, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[non_exhaustive] +pub enum PropertyCodePointMapV1<'data, T: TrieValue> { + /// A codepoint trie storing the data + CodePointTrie(#[cfg_attr(feature = "serde", serde(borrow))] CodePointTrie<'data, T>), + // new variants should go BELOW existing ones + // Serde serializes based on variant name and index in the enum + // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant +} + +/// A set of characters and strings which share a particular property value. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[non_exhaustive] +pub enum PropertyUnicodeSetV1<'data> { + /// A set representing characters in an inversion list, and the strings in a list. 
+ CPInversionListStrList( + #[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionListAndStringList<'data>, + ), + // new variants should go BELOW existing ones + // Serde serializes based on variant name and index in the enum + // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant +} + +impl<'data> PropertyUnicodeSetV1<'data> { + #[inline] + pub(crate) fn contains(&self, s: &str) -> bool { + match *self { + Self::CPInversionListStrList(ref l) => l.contains(s), + } + } + + #[inline] + pub(crate) fn contains32(&self, cp: u32) -> bool { + match *self { + Self::CPInversionListStrList(ref l) => l.contains32(cp), + } + } + + #[inline] + pub(crate) fn contains_char(&self, ch: char) -> bool { + match *self { + Self::CPInversionListStrList(ref l) => l.contains_char(ch), + } + } + + #[inline] + pub(crate) fn from_code_point_inversion_list_string_list( + l: CodePointInversionListAndStringList<'static>, + ) -> Self { + Self::CPInversionListStrList(l) + } + + #[inline] + pub(crate) fn as_code_point_inversion_list_string_list( + &'_ self, + ) -> Option<&'_ CodePointInversionListAndStringList<'data>> { + match *self { + Self::CPInversionListStrList(ref l) => Some(l), + // any other backing data structure that cannot return a CPInversionListStrList in O(1) time should return None + } + } + + #[inline] + pub(crate) fn to_code_point_inversion_list_string_list( + &self, + ) -> CodePointInversionListAndStringList<'_> { + match *self { + Self::CPInversionListStrList(ref t) => ZeroFrom::zero_from(t), + } + } +} + +/// A struct that efficiently stores `Script` and `Script_Extensions` property data. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +#[icu_provider::data_struct(marker( + ScriptWithExtensionsPropertyV1Marker, + "props/scx@1", + singleton +))] +#[derive(Debug, Eq, PartialEq, Clone)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct ScriptWithExtensionsPropertyV1<'data> { + /// Note: The `ScriptWithExt` values in this array will assume a 12-bit layout. The 2 + /// higher order bits 11..10 will indicate how to deduce the Script value and + /// Script_Extensions value, nearly matching the representation + /// [in ICU](https://github.com/unicode-org/icu/blob/main/icu4c/source/common/uprops.h): + /// + /// | High order 2 bits value | Script | Script_Extensions | + /// |-------------------------|--------------------------------------------------------|----------------------------------------------------------------| + /// | 3 | First value in sub-array, index given by lower 10 bits | Sub-array excluding first value, index given by lower 10 bits | + /// | 2 | Script=Inherited | Entire sub-array, index given by lower 10 bits | + /// | 1 | Script=Common | Entire sub-array, index given by lower 10 bits | + /// | 0 | Value in lower 10 bits | `[ Script value ]` single-element array | + /// + /// When the lower 10 bits of the value are used as an index, that index is + /// used for the outer-level vector of the nested `extensions` structure. + #[cfg_attr(feature = "serde", serde(borrow))] + pub trie: CodePointTrie<'data, ScriptWithExt>, + + /// This companion structure stores Script_Extensions values, which are + /// themselves arrays / vectors. This structure only stores the values for + /// cases in which `scx(cp) != [ sc(cp) ]`. Each sub-vector is distinct. The + /// sub-vector represents the Script_Extensions array value for a code point, + /// and may also indicate Script value, as described for the `trie` field. 
+ #[cfg_attr(feature = "serde", serde(borrow))] + pub extensions: VarZeroVec<'data, ZeroSlice<Script>>, +} + +impl<'data> ScriptWithExtensionsPropertyV1<'data> { + // This method is intended to be used by constructors of deserialized data + // in a data provider. + #[doc(hidden)] + pub fn new( + trie: CodePointTrie<'data, ScriptWithExt>, + extensions: VarZeroVec<'data, ZeroSlice<Script>>, + ) -> ScriptWithExtensionsPropertyV1<'data> { + ScriptWithExtensionsPropertyV1 { trie, extensions } + } +} + +// See CodePointSetData for documentation of these functions +impl<'data> PropertyCodePointSetV1<'data> { + #[inline] + pub(crate) fn contains(&self, ch: char) -> bool { + match *self { + Self::InversionList(ref l) => l.contains(ch), + } + } + + #[inline] + pub(crate) fn contains32(&self, ch: u32) -> bool { + match *self { + Self::InversionList(ref l) => l.contains32(ch), + } + } + + #[inline] + pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ { + match *self { + Self::InversionList(ref l) => l.iter_ranges(), + } + } + + #[inline] + pub(crate) fn iter_ranges_complemented( + &self, + ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ { + match *self { + Self::InversionList(ref l) => l.iter_ranges_complemented(), + } + } + + #[inline] + pub(crate) fn from_code_point_inversion_list(l: CodePointInversionList<'static>) -> Self { + Self::InversionList(l) + } + + #[inline] + pub(crate) fn as_code_point_inversion_list( + &'_ self, + ) -> Option<&'_ CodePointInversionList<'data>> { + match *self { + Self::InversionList(ref l) => Some(l), + // any other backing data structure that cannot return a CPInvList in O(1) time should return None + } + } + + #[inline] + pub(crate) fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> { + match *self { + Self::InversionList(ref t) => ZeroFrom::zero_from(t), + } + } +} + +// See CodePointMapData for documentation of these functions +impl<'data, T: TrieValue> PropertyCodePointMapV1<'data, T> { 
+ #[inline] + pub(crate) fn get32(&self, ch: u32) -> T { + match *self { + Self::CodePointTrie(ref t) => t.get32(ch), + } + } + + #[inline] + pub(crate) fn try_into_converted<P>( + self, + ) -> Result<PropertyCodePointMapV1<'data, P>, ZeroVecError> + where + P: TrieValue, + { + match self { + Self::CodePointTrie(t) => t + .try_into_converted() + .map(PropertyCodePointMapV1::CodePointTrie), + } + } + + #[inline] + pub(crate) fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> { + match *self { + Self::CodePointTrie(ref t) => t.get_set_for_value(value), + } + } + + #[inline] + pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = CodePointMapRange<T>> + '_ { + match *self { + Self::CodePointTrie(ref t) => t.iter_ranges(), + } + } + #[inline] + pub(crate) fn iter_ranges_mapped<'a, U: Eq + 'a>( + &'a self, + map: impl FnMut(T) -> U + Copy + 'a, + ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a { + match *self { + Self::CodePointTrie(ref t) => t.iter_ranges_mapped(map), + } + } + + #[inline] + pub(crate) fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self { + Self::CodePointTrie(trie) + } + + #[inline] + pub(crate) fn as_code_point_trie(&self) -> Option<&CodePointTrie<'data, T>> { + match *self { + Self::CodePointTrie(ref t) => Some(t), + // any other backing data structure that cannot return a CPT in O(1) time should return None + } + } + + #[inline] + pub(crate) fn to_code_point_trie(&self) -> CodePointTrie<'_, T> { + match *self { + Self::CodePointTrie(ref t) => ZeroFrom::zero_from(t), + } + } +} + +macro_rules! expand { + ( + ($(($code_point_set_marker:ident, $bin_cp_s:literal),)+), + ($(($unicode_set_marker:ident, $bin_us_s:literal, $us_singleton:literal),)+), + ($(($code_point_map_marker:ident, + $name_value_marker:ident, + + $((sparse: $value_short_name_marker_sparse:ident, $value_long_name_marker_sparse:ident),)? + $((linear: $value_short_name_marker_linear:ident, $value_long_name_marker_linear:ident ),)? 
+ $((linear4: $value_short_name_marker_linear4:ident, $value_long_name_marker_linear4:ident ),)? + $enum_s:literal, $value_ty:ident),)+) + ) => { + + // Data keys that return code point sets (represented as CodePointSetData). + // For now, synonymous with binary properties of code points only. + $( + #[doc = core::concat!("Data marker for the '", $bin_cp_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $code_point_set_marker; + + impl DataMarker for $code_point_set_marker { + type Yokeable = PropertyCodePointSetV1<'static>; + } + impl KeyedDataMarker for $code_point_set_marker { + const KEY: DataKey = data_key!(concat!("props/", $bin_cp_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + + )+ + + // Data keys that return sets of strings + code points (represented as UnicodeSetData). + // Includes: + // - binary properties of strings + code points + // - exemplar characters + $( + #[doc = core::concat!("Data marker for the '", $bin_us_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $unicode_set_marker; + + impl DataMarker for $unicode_set_marker { + type Yokeable = PropertyUnicodeSetV1<'static>; + } + impl KeyedDataMarker for $unicode_set_marker { + const KEY: DataKey = data_key!(concat!("props/", $bin_us_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, $us_singleton)); + } + )+ + + // Data keys that return code point map (represented as CodePointMapData). + // For now, synonymous with enumerated properties [of code points only]. 
+ $( + #[doc = core::concat!("Data marker for the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $code_point_map_marker; + + impl DataMarker for $code_point_map_marker { + type Yokeable = PropertyCodePointMapV1<'static, crate::$value_ty>; + } + + impl KeyedDataMarker for $code_point_map_marker { + const KEY: DataKey = data_key!(concat!("props/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + + + #[doc = core::concat!("Data marker for parsing the names of the values of the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $name_value_marker; + + impl DataMarker for $name_value_marker { + type Yokeable = names::PropertyValueNameToEnumMapV1<'static>; + } + + impl KeyedDataMarker for $name_value_marker { + const KEY: DataKey = data_key!(concat!("propnames/from/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + + $( + #[doc = core::concat!("Data marker for producing short names of the values of the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $value_short_name_marker_sparse; + + impl DataMarker for $value_short_name_marker_sparse { + type Yokeable = names::PropertyEnumToValueNameSparseMapV1<'static>; + } + + impl KeyedDataMarker for $value_short_name_marker_sparse { + const KEY: DataKey = data_key!(concat!("propnames/to/short/sparse/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + + #[doc = core::concat!("Data marker for producing long names of the values of the '", $enum_s, 
"' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $value_long_name_marker_sparse; + + impl DataMarker for $value_long_name_marker_sparse { + type Yokeable = names::PropertyEnumToValueNameSparseMapV1<'static>; + } + + impl KeyedDataMarker for $value_long_name_marker_sparse { + const KEY: DataKey = data_key!(concat!("propnames/to/long/sparse/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + )? + + $( + #[doc = core::concat!("Data marker for producing short names of the values of the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $value_short_name_marker_linear; + + impl DataMarker for $value_short_name_marker_linear { + type Yokeable = names::PropertyEnumToValueNameLinearMapV1<'static>; + } + + impl KeyedDataMarker for $value_short_name_marker_linear { + const KEY: DataKey = data_key!(concat!("propnames/to/short/linear/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + + #[doc = core::concat!("Data marker for producing long names of the values of the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $value_long_name_marker_linear; + + impl DataMarker for $value_long_name_marker_linear { + type Yokeable = names::PropertyEnumToValueNameLinearMapV1<'static>; + } + + impl KeyedDataMarker for $value_long_name_marker_linear { + const KEY: DataKey = data_key!(concat!("propnames/to/long/linear/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + )? 
+ + $( + #[doc = core::concat!("Data marker for producing short names of the values of the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $value_short_name_marker_linear4; + + impl DataMarker for $value_short_name_marker_linear4 { + type Yokeable = names::PropertyEnumToValueNameLinearTiny4MapV1<'static>; + } + + impl KeyedDataMarker for $value_short_name_marker_linear4 { + const KEY: DataKey = data_key!(concat!("propnames/to/short/linear4/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + + #[doc = core::concat!("Data marker for producing long names of the values of the '", $enum_s, "' Unicode property")] + #[derive(Debug, Default)] + #[cfg_attr( + feature = "datagen", + derive(databake::Bake), + databake(path = icu_properties::provider), + )] + pub struct $value_long_name_marker_linear4; + + impl DataMarker for $value_long_name_marker_linear4 { + // Tiny4 is only for short names + type Yokeable = names::PropertyEnumToValueNameLinearMapV1<'static>; + } + + impl KeyedDataMarker for $value_long_name_marker_linear4 { + const KEY: DataKey = data_key!(concat!("propnames/to/long/linear/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true)); + } + )? + )+ + + /// All data keys in this module. + pub const KEYS: &[DataKey] = &[ + $($code_point_set_marker::KEY,)+ + $($unicode_set_marker::KEY,)+ + $( + $code_point_map_marker::KEY, + $name_value_marker::KEY, + $($value_short_name_marker_sparse::KEY, $value_long_name_marker_sparse::KEY,)? + $($value_short_name_marker_linear::KEY, $value_long_name_marker_linear::KEY,)? + $($value_short_name_marker_linear4::KEY, $value_long_name_marker_linear4::KEY,)? 
+ )+ + bidi_data::BidiAuxiliaryPropertiesV1Marker::KEY, + GeneralCategoryMaskNameToValueV1Marker::KEY, + ScriptWithExtensionsPropertyV1Marker::KEY, + ]; + }; +} + +pub use self::names::GeneralCategoryMaskNameToValueV1Marker; + +expand!( + ( + // code point sets + (AsciiHexDigitV1Marker, "AHex"), + (AlnumV1Marker, "alnum"), + (AlphabeticV1Marker, "Alpha"), + (BidiControlV1Marker, "Bidi_C"), + (BidiMirroredV1Marker, "Bidi_M"), + (BlankV1Marker, "blank"), + (CasedV1Marker, "Cased"), + (CaseIgnorableV1Marker, "CI"), + (FullCompositionExclusionV1Marker, "Comp_Ex"), + (ChangesWhenCasefoldedV1Marker, "CWCF"), + (ChangesWhenCasemappedV1Marker, "CWCM"), + (ChangesWhenNfkcCasefoldedV1Marker, "CWKCF"), + (ChangesWhenLowercasedV1Marker, "CWL"), + (ChangesWhenTitlecasedV1Marker, "CWT"), + (ChangesWhenUppercasedV1Marker, "CWU"), + (DashV1Marker, "Dash"), + (DeprecatedV1Marker, "Dep"), + (DefaultIgnorableCodePointV1Marker, "DI"), + (DiacriticV1Marker, "Dia"), + (EmojiModifierBaseV1Marker, "EBase"), + (EmojiComponentV1Marker, "EComp"), + (EmojiModifierV1Marker, "EMod"), + (EmojiV1Marker, "Emoji"), + (EmojiPresentationV1Marker, "EPres"), + (ExtenderV1Marker, "Ext"), + (ExtendedPictographicV1Marker, "ExtPict"), + (GraphV1Marker, "graph"), + (GraphemeBaseV1Marker, "Gr_Base"), + (GraphemeExtendV1Marker, "Gr_Ext"), + (GraphemeLinkV1Marker, "Gr_Link"), + (HexDigitV1Marker, "Hex"), + (HyphenV1Marker, "Hyphen"), + (IdContinueV1Marker, "IDC"), + (IdeographicV1Marker, "Ideo"), + (IdStartV1Marker, "IDS"), + (IdsBinaryOperatorV1Marker, "IDSB"), + (IdsTrinaryOperatorV1Marker, "IDST"), + (JoinControlV1Marker, "Join_C"), + (LogicalOrderExceptionV1Marker, "LOE"), + (LowercaseV1Marker, "Lower"), + (MathV1Marker, "Math"), + (NoncharacterCodePointV1Marker, "NChar"), + (NfcInertV1Marker, "nfcinert"), + (NfdInertV1Marker, "nfdinert"), + (NfkcInertV1Marker, "nfkcinert"), + (NfkdInertV1Marker, "nfkdinert"), + (PatternSyntaxV1Marker, "Pat_Syn"), + (PatternWhiteSpaceV1Marker, "Pat_WS"), + 
(PrependedConcatenationMarkV1Marker, "PCM"), + (PrintV1Marker, "print"), + (QuotationMarkV1Marker, "QMark"), + (RadicalV1Marker, "Radical"), + (RegionalIndicatorV1Marker, "RI"), + (SoftDottedV1Marker, "SD"), + (SegmentStarterV1Marker, "segstart"), + (CaseSensitiveV1Marker, "Sensitive"), + (SentenceTerminalV1Marker, "STerm"), + (TerminalPunctuationV1Marker, "Term"), + (UnifiedIdeographV1Marker, "UIdeo"), + (UppercaseV1Marker, "Upper"), + (VariationSelectorV1Marker, "VS"), + (WhiteSpaceV1Marker, "WSpace"), + (XdigitV1Marker, "xdigit"), + (XidContinueV1Marker, "XIDC"), + (XidStartV1Marker, "XIDS"), + ), + ( + // UnicodeSets (code points + strings) + (BasicEmojiV1Marker, "Basic_Emoji", true), + (ExemplarCharactersMainV1Marker, "exemplarchars/main", false), + ( + ExemplarCharactersAuxiliaryV1Marker, + "exemplarchars/auxiliary", + false + ), + ( + ExemplarCharactersPunctuationV1Marker, + "exemplarchars/punctuation", + false + ), + ( + ExemplarCharactersNumbersV1Marker, + "exemplarchars/numbers", + false + ), + ( + ExemplarCharactersIndexV1Marker, + "exemplarchars/index", + false + ), + ), + ( + // code point maps + ( + CanonicalCombiningClassV1Marker, + CanonicalCombiningClassNameToValueV1Marker, + ( + sparse: CanonicalCombiningClassValueToShortNameV1Marker, + CanonicalCombiningClassValueToLongNameV1Marker + ), + "ccc", + CanonicalCombiningClass + ), + ( + GeneralCategoryV1Marker, + GeneralCategoryNameToValueV1Marker, + ( + linear: GeneralCategoryValueToShortNameV1Marker, + GeneralCategoryValueToLongNameV1Marker + ), + "gc", + GeneralCategory + ), + ( + BidiClassV1Marker, + BidiClassNameToValueV1Marker, + ( + linear: BidiClassValueToShortNameV1Marker, + BidiClassValueToLongNameV1Marker + ), + "bc", + BidiClass + ), + ( + ScriptV1Marker, + ScriptNameToValueV1Marker, + ( + linear4: ScriptValueToShortNameV1Marker, + ScriptValueToLongNameV1Marker + ), + "sc", + Script + ), + ( + EastAsianWidthV1Marker, + EastAsianWidthNameToValueV1Marker, + ( + linear: 
EastAsianWidthValueToShortNameV1Marker, + EastAsianWidthValueToLongNameV1Marker + ), + "ea", + EastAsianWidth + ), + ( + LineBreakV1Marker, + LineBreakNameToValueV1Marker, + ( + linear: LineBreakValueToShortNameV1Marker, + LineBreakValueToLongNameV1Marker + ), + "lb", + LineBreak + ), + ( + GraphemeClusterBreakV1Marker, + GraphemeClusterBreakNameToValueV1Marker, + ( + linear: GraphemeClusterBreakValueToShortNameV1Marker, + GraphemeClusterBreakValueToLongNameV1Marker + ), + "GCB", + GraphemeClusterBreak + ), + ( + WordBreakV1Marker, + WordBreakNameToValueV1Marker, + ( + linear: WordBreakValueToShortNameV1Marker, + WordBreakValueToLongNameV1Marker + ), + "WB", + WordBreak + ), + ( + SentenceBreakV1Marker, + SentenceBreakNameToValueV1Marker, + ( + linear: SentenceBreakValueToShortNameV1Marker, + SentenceBreakValueToLongNameV1Marker + ), + "SB", + SentenceBreak + ), + ( + IndicSyllabicCategoryV1Marker, + IndicSyllabicCategoryNameToValueV1Marker, + ( + linear: IndicSyllabicCategoryValueToShortNameV1Marker, + IndicSyllabicCategoryValueToLongNameV1Marker + ), + "InSC", + IndicSyllabicCategory + ), + // note: the names key for the GCM mask is handled above + ) +); diff --git a/third_party/rust/icu_properties/src/provider/bidi_data.rs b/third_party/rust/icu_properties/src/provider/bidi_data.rs new file mode 100644 index 0000000000..465ed4ebb7 --- /dev/null +++ b/third_party/rust/icu_properties/src/provider/bidi_data.rs @@ -0,0 +1,289 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. +//! +//! <div class="stab unstable"> +//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! 
to be stable, their Rust representation might not be. Use with caution. +//! </div> +//! +//! Read more about data providers: [`icu_provider`] +//! +//! This module provides an efficient storage of data serving the following +//! properties: +//! - `Bidi_Paired_Bracket` +//! - `Bidi_Paired_Bracket_Type` +//! - `Bidi_Mirrored` +//! - `Bidi_Mirroring_Glyph` + +use displaydoc::Display; +use icu_collections::codepointtrie::{CodePointTrie, TrieValue}; +use icu_provider::prelude::*; +use zerovec::ule::{AsULE, CharULE, ULE}; +use zerovec::ZeroVecError; + +/// A data provider struct for properties related to Bidi algorithms, including +/// mirroring and bracket pairing. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[icu_provider::data_struct(marker( + BidiAuxiliaryPropertiesV1Marker, + "props/bidiauxiliaryprops@1", + singleton +))] +#[derive(Debug, Eq, PartialEq, Clone)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider::bidi_data), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct BidiAuxiliaryPropertiesV1<'data> { + /// A `CodePointTrie` efficiently storing the data from which property values + /// can be extracted or derived for the supported Bidi properties. 
+ #[cfg_attr(feature = "serde", serde(borrow))] + pub trie: CodePointTrie<'data, MirroredPairedBracketData>, +} + +impl<'data> BidiAuxiliaryPropertiesV1<'data> { + #[doc(hidden)] + pub fn new( + trie: CodePointTrie<'data, MirroredPairedBracketData>, + ) -> BidiAuxiliaryPropertiesV1<'data> { + BidiAuxiliaryPropertiesV1 { trie } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))] +#[doc(hidden)] // needed for datagen but not intended for users +pub struct MirroredPairedBracketData { + pub mirroring_glyph: char, + pub mirrored: bool, + pub paired_bracket_type: CheckedBidiPairedBracketType, +} + +impl Default for MirroredPairedBracketData { + fn default() -> Self { + Self { + mirroring_glyph: 0 as char, + mirrored: false, + paired_bracket_type: CheckedBidiPairedBracketType::None, + } + } +} + +impl From<MirroredPairedBracketData> for u32 { + fn from(mpbd: MirroredPairedBracketData) -> u32 { + let mut result = mpbd.mirroring_glyph as u32; + result |= (mpbd.mirrored as u32) << 21; + result |= (mpbd.paired_bracket_type as u32) << 22; + result + } +} + +/// A `u32` serialized value of `MirroredPairedBracketData` did not encode either a valid Bidi_Mirroring_Glyph or a valid Bidi_Paired_Bracket_Type +#[derive(Display, Debug, Clone, Copy, PartialEq, Eq)] +#[displaydoc("Invalid MirroredPairedBracketData serialized in int: {0}")] +pub struct MirroredPairedBracketDataTryFromError(u32); + +impl TryFrom<u32> for MirroredPairedBracketData { + type Error = MirroredPairedBracketDataTryFromError; + + fn try_from(i: u32) -> Result<Self, MirroredPairedBracketDataTryFromError> { + let code_point = i & 0x1FFFFF; + let mirroring_glyph = + char::try_from_u32(code_point).map_err(|_| MirroredPairedBracketDataTryFromError(i))?; + let mirrored = ((i >> 
21) & 0x1) == 1; + let paired_bracket_type = { + let value = ((i >> 22) & 0x3) as u8; + match value { + 0 => CheckedBidiPairedBracketType::None, + 1 => CheckedBidiPairedBracketType::Open, + 2 => CheckedBidiPairedBracketType::Close, + _ => { + return Err(MirroredPairedBracketDataTryFromError(i)); + } + } + }; + Ok(MirroredPairedBracketData { + mirroring_glyph, + mirrored, + paired_bracket_type, + }) + } +} + +/// A closed Rust enum representing a closed set of the incoming Bidi_Paired_Bracket_Type +/// property values necessary in the internal representation of `MirroredPairedBracketData` +/// to satisfy the ULE invariants on valid values. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))] +#[repr(u8)] +#[zerovec::make_ule(CheckedBidiPairedBracketTypeULE)] +// This enum is closed in order to help with ULE validation for MirroredPairedBracketData. +#[allow(clippy::exhaustive_enums)] +pub enum CheckedBidiPairedBracketType { + /// Not a paired bracket. + None = 0, + /// Open paired bracket. + Open = 1, + /// Close paired bracket. + Close = 2, +} + +/// Bit layout for the 24 bits (0..=23) of the `[u8; 3]` ULE raw type. 
+/// LE means first byte is 0..=7, second byte 8..=15, third byte is 16..=23 +/// 0..=20 Code point return value for Bidi_Mirroring_Glyph value +/// extracted with: mask = 0x1FFFFF <=> [bytes[0], bytes[1], bytes[2] & 0x1F] +/// 21..=21 Boolean for Bidi_Mirrored +/// extracted with: bitshift right by 21 followed by mask = 0x1 <=> (bytes[2] >> 5) & 0x1 +/// 22..=23 Enum discriminant value for Bidi_Paired_Bracket_Type +/// extracted with: bitshift right by 22 followed by mask = 0x3 <=> (bytes[2] >> 6) & 0x3 +/// <=> (bytes[2] >> 6) b/c we left fill with 0s on bitshift right for unsigned +/// numbers and a byte has 8 bits +#[doc(hidden)] +/// needed for datagen but not intended for users +#[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)] +#[repr(packed)] +pub struct MirroredPairedBracketDataULE([u8; 3]); + +// Safety (based on the safety checklist on the ULE trait): +// 1. MirroredPairedBracketDataULE does not include any uninitialized or padding bytes +// (achieved by `#[repr(packed)]` on a type that satisfies this invariant) +// 2. MirroredPairedBracketDataULE is aligned to 1 byte. +// (achieved by `#[repr(packed)]` on a type that satisfies this invariant) +// 3. The impl of validate_byte_slice() returns an error if any byte is not valid. +// 4. The impl of validate_byte_slice() returns an error if there are extra bytes. +// 5. The other ULE methods use the default impl. +// 6. 
MirroredPairedBracketDataULE byte equality is semantic equality because all bits +// are used, so no unused bits requires no extra work to zero out unused bits +unsafe impl ULE for MirroredPairedBracketDataULE { + #[inline] + fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> { + if bytes.len() % 3 != 0 { + return Err(ZeroVecError::length::<Self>(bytes.len())); + } + // Validate the bytes + #[allow(clippy::indexing_slicing)] // Won't panic because the chunks are always 3 bytes long + for byte_triple in bytes.chunks_exact(3) { + // Bidi_Mirroring_Glyph validation + #[allow(clippy::unwrap_used)] // chunks_exact returns slices of length 3 + let [byte0, byte1, byte2] = *<&[u8; 3]>::try_from(byte_triple).unwrap(); + let mut mirroring_glyph_code_point: u32 = (byte2 & 0x1F) as u32; + mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte1 as u32); + mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte0 as u32); + let _mirroring_glyph = + char::from_u32(mirroring_glyph_code_point).ok_or(ZeroVecError::parse::<Self>())?; + + // skip validating the Bidi_Mirrored boolean since it is always valid + + // assert that Bidi_Paired_Bracket_Type cannot have a 4th value because it only + // has 3 values: Open, Close, None + if (byte2 & 0xC0) == 0xC0 { + return Err(ZeroVecError::parse::<Self>()); + } + } + + Ok(()) + } +} + +impl AsULE for MirroredPairedBracketData { + type ULE = MirroredPairedBracketDataULE; + + #[inline] + fn to_unaligned(self) -> Self::ULE { + let mut ch = u32::from(self.mirroring_glyph); + ch |= u32::from(self.mirrored) << 21; + ch |= (self.paired_bracket_type as u32) << 22; + let [byte0, byte1, byte2, _] = ch.to_le_bytes(); + MirroredPairedBracketDataULE([byte0, byte1, byte2]) + } + + #[inline] + fn from_unaligned(unaligned: Self::ULE) -> Self { + let [unaligned_byte0, unaligned_byte1, unaligned_byte2] = unaligned.0; + let mirroring_glyph_ule_bytes = &[unaligned_byte0, unaligned_byte1, unaligned_byte2 & 0x1F]; + // 
Safe because the lower bits 20..0 of MirroredPairedBracketDataULE bytes are the CharULE bytes, + // and CharULE::from_unaligned is safe because bytes are defined to represent a valid Unicode code point. + let mirroring_glyph_ule = + unsafe { CharULE::from_byte_slice_unchecked(mirroring_glyph_ule_bytes) }; + let mirroring_glyph = mirroring_glyph_ule + .first() + .map(|ule| char::from_unaligned(*ule)) + .unwrap_or(char::REPLACEMENT_CHARACTER); + let mirrored = ((unaligned.0[2] >> 5) & 0x1) == 1; + let paired_bracket_type = { + let discriminant = unaligned.0[2] >> 6; + debug_assert!( + discriminant != 3, + "Bidi_Paired_Bracket_Type can only be Open/Close/None in MirroredPairedBracketData" + ); + match discriminant { + 1 => CheckedBidiPairedBracketType::Open, + 2 => CheckedBidiPairedBracketType::Close, + _ => CheckedBidiPairedBracketType::None, + } + }; + + MirroredPairedBracketData { + mirroring_glyph, + mirrored, + paired_bracket_type, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse() { + // data for U+007B LEFT CURLY BRACKET + + // serialize to ULE bytes + let data = MirroredPairedBracketData { + mirroring_glyph: '}', + mirrored: true, + paired_bracket_type: CheckedBidiPairedBracketType::Open, + }; + let expected_bytes = &[0x7D, 0x0, 0x60]; + assert_eq!( + expected_bytes, + MirroredPairedBracketDataULE::as_byte_slice(&[data.to_unaligned()]) + ); + + // deserialize from ULE bytes + let ule = MirroredPairedBracketDataULE::parse_byte_slice(expected_bytes).unwrap(); + let parsed_data = MirroredPairedBracketData::from_unaligned(*ule.first().unwrap()); + assert_eq!(data, parsed_data); + } + + #[test] + fn test_parse_error() { + // data for U+007B LEFT CURLY BRACKET + let ule_bytes = &mut [0x7D, 0x0, 0x60]; + + // Set discriminant value for the CheckedBidiPairedBracketType enum to be invalid. 
+ // CheckedBidiPairedBracketType only has 3 values (discriminants => 0..=2), so the 4th + // expressible value from the 2 bits (3) should not parse successfully. + ule_bytes[2] |= 0xC0; + + // deserialize from ULE bytes + let ule_parse_result = MirroredPairedBracketDataULE::parse_byte_slice(ule_bytes); + assert!(ule_parse_result.is_err()); + } +} diff --git a/third_party/rust/icu_properties/src/provider/names.rs b/third_party/rust/icu_properties/src/provider/names.rs new file mode 100644 index 0000000000..f521f715ce --- /dev/null +++ b/third_party/rust/icu_properties/src/provider/names.rs @@ -0,0 +1,277 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 🚧 \[Unstable\] Property names-related data for this component +//! +//! <div class="stab unstable"> +//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! to be stable, their Rust representation might not be. Use with caution. +//! </div> +//! +//! Read more about data providers: [`icu_provider`] + +use alloc::boxed::Box; +use core::cmp::Ordering; + +use core::str; + +use icu_provider::prelude::*; + +use tinystr::TinyStr4; +use zerovec::ule::{UnvalidatedStr, VarULE}; +use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap, ZeroVec}; + +/// This is a property name that can be "loose matched" as according to +/// [PropertyValueAliases.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt) +/// +/// (matched case-insensitively in ASCII, ignoring underscores, whitespace, and hyphens) +/// +/// This is expected to be ASCII, but we do not rely on this invariant anywhere except during +/// datagen. 
+/// +/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items +/// will sort into the same area, such that a map can be searched for both strict and loose equality. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +/// +/// # Examples +/// +/// Using a [`NormalizedPropertyNameStr`] as the key of a [`ZeroMap`]: +/// +/// ``` +/// use icu_properties::provider::names::NormalizedPropertyNameStr; +/// use zerovec::ZeroMap; +/// +/// let map: ZeroMap<NormalizedPropertyNameStr, usize> = [ +/// (NormalizedPropertyNameStr::from_str("A_BC"), 11), +/// (NormalizedPropertyNameStr::from_str("dEf"), 22), +/// (NormalizedPropertyNameStr::from_str("G_H-I"), 33), +/// ] +/// .into_iter() +/// .collect(); +/// +/// let key_approx = NormalizedPropertyNameStr::from_str("AB-C"); +/// let key_exact = NormalizedPropertyNameStr::from_str("A_BC"); +/// +/// // Strict lookup: +/// assert_eq!(None, map.get_copied(key_approx)); +/// assert_eq!(Some(11), map.get_copied(key_exact)); +/// +/// // Loose lookup: +/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_approx))); +/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_exact))); +/// ``` +#[derive(PartialEq, Eq)] // VarULE wants these to be byte equality +#[derive(Debug, VarULE)] +#[cfg_attr(feature = "serde", derive(serde::Serialize))] +#[repr(transparent)] +pub struct NormalizedPropertyNameStr(UnvalidatedStr); + +/// This impl requires enabling the optional `serde` Cargo feature of the `icu_properties` crate +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for Box<NormalizedPropertyNameStr> { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: 
serde::Deserializer<'de>, + { + <Box<UnvalidatedStr>>::deserialize(deserializer).map(NormalizedPropertyNameStr::cast_box) + } +} + +/// This impl requires enabling the optional `serde` Cargo feature of the `icu_properties` crate +#[cfg(feature = "serde")] +impl<'de, 'a> serde::Deserialize<'de> for &'a NormalizedPropertyNameStr +where + 'de: 'a, +{ + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + <&UnvalidatedStr>::deserialize(deserializer).map(NormalizedPropertyNameStr::cast_ref) + } +} + +impl<'a> ZeroMapKV<'a> for NormalizedPropertyNameStr { + type Container = VarZeroVec<'a, NormalizedPropertyNameStr>; + type Slice = VarZeroSlice<NormalizedPropertyNameStr>; + type GetType = NormalizedPropertyNameStr; + type OwnedType = Box<NormalizedPropertyNameStr>; +} + +/// The Ord/PartialOrd impl will sort things using strict equality, but in such a way that all loose-equal items +/// will sort into the same area, such that a map can be searched for both strict and loose equality. +impl PartialOrd for NormalizedPropertyNameStr { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +/// Normalize a character based on the "loose matching" described in PropertyValueAliases.txt, +/// returning `None` for skippable characters +/// +/// ICU has [code for this][1] (and [during property lookup][2]) which we emulate. +/// In particular, ICU only does normalization within ASCII, which makes sense since character names +/// seem to be only ASCII. 
/// Maps one byte of a property name to its "loose matching" form, following the rules
/// described in PropertyValueAliases.txt.
///
/// Returns `None` for bytes that loose matching skips: ASCII whitespace (including the
/// vertical tab), underscores and hyphens. Every other byte is returned ASCII-lowercased,
/// which leaves non-ASCII bytes untouched.
///
/// This emulates what ICU does [when comparing names][1] and [during property lookup][2];
/// ICU likewise limits the normalization to ASCII.
///
/// [1]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L35
/// [2]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L226-L230
fn normalize_char(ch: u8) -> Option<u8> {
    // `is_ascii_whitespace()` does not cover the vertical tab (0x0B), so it is
    // listed explicitly next to the separator characters.
    let is_separator = ch == b'_' || ch == b'-' || ch == 0x0B;

    if ch.is_ascii_whitespace() || is_separator {
        None
    } else {
        // Case is ignored by folding everything to lowercase.
        Some(ch.to_ascii_lowercase())
    }
}
/// A mapping of property value names to their value discriminants.
///
/// Keys are stored as [`NormalizedPropertyNameStr`], whose ordering allows the map to be
/// searched for both strict and loose name equality.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq)]
#[icu_provider::data_struct(marker(
    GeneralCategoryMaskNameToValueV1Marker,
    "propnames/from/gcm@1",
    singleton,
))]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyValueNameToEnumMapV1<'data> {
    /// A map from names to their value discriminant
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub map: ZeroMap<'data, NormalizedPropertyNameStr, u16>,
}
+/// </div> +#[derive(Debug, Clone, PartialEq)] +#[icu_provider::data_struct] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider::names), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[yoke(prove_covariance_manually)] +pub struct PropertyEnumToValueNameSparseMapV1<'data> { + /// A map from the value discriminant to the names + #[cfg_attr(feature = "serde", serde(borrow))] + pub map: ZeroMap<'data, u16, str>, +} + +/// A mapping of property values to their names. A single instance of this map will only cover +/// either long or short names, determined whilst loading data. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, Clone, PartialEq)] +#[icu_provider::data_struct] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider::names), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[yoke(prove_covariance_manually)] +pub struct PropertyEnumToValueNameLinearMapV1<'data> { + /// A map from the value discriminant (the index) to the names, for mostly + /// contiguous data. Empty strings count as missing. + #[cfg_attr(feature = "serde", serde(borrow))] + pub map: VarZeroVec<'data, str>, +} + +/// A mapping of property values to their names. A single instance of this map will only cover +/// either long or short names, determined whilst loading data. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. 
While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, Clone, PartialEq)] +#[icu_provider::data_struct] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_properties::provider::names), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[yoke(prove_covariance_manually)] +pub struct PropertyEnumToValueNameLinearTiny4MapV1<'data> { + /// A map from the value discriminant (the index) to the names, for mostly + /// contiguous data. Empty strings count as missing. + #[cfg_attr(feature = "serde", serde(borrow))] + pub map: ZeroVec<'data, TinyStr4>, +} diff --git a/third_party/rust/icu_properties/src/runtime.rs b/third_party/rust/icu_properties/src/runtime.rs new file mode 100644 index 0000000000..79307dd6f1 --- /dev/null +++ b/third_party/rust/icu_properties/src/runtime.rs @@ -0,0 +1,360 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you +//! have a use case for this! +//! +//! This module contains utilities for working with properties where the specific property in use +//! is not known at compile time. +//! +//! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working +//! with properties at runtime tailored for the use case of ECMA262-compatible regex engines. + +#[cfg(doc)] +use crate::{maps, script, GeneralCategory, GeneralCategoryGroup, Script}; + +/// This type can represent any Unicode property. +/// +/// This is intended to be used in situations where the exact unicode property needed is +/// only known at runtime, for example in regex engines. 
+/// +/// The values are intended to be identical to ICU4C's UProperty enum +#[allow(clippy::exhaustive_structs)] // newtype +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub struct UnicodeProperty(pub u32); + +#[allow(non_upper_case_globals)] +#[allow(unused)] // experimental, may be made public later +impl UnicodeProperty { + /// Binary property `Alphabetic` + pub const Alphabetic: Self = UnicodeProperty(0); + /// Binary property `ASCII_Hex_Digit` + pub const AsciiHexDigit: Self = UnicodeProperty(1); + /// Binary property `Bidi_Control` + pub const BidiControl: Self = UnicodeProperty(2); + /// Binary property `Bidi_Mirrored` + pub const BidiMirrored: Self = UnicodeProperty(3); + /// Binary property `Dash` + pub const Dash: Self = UnicodeProperty(4); + /// Binary property `Default_Ignorable_Code_Point` + pub const DefaultIgnorableCodePoint: Self = UnicodeProperty(5); + /// Binary property `Deprecated` + pub const Deprecated: Self = UnicodeProperty(6); + /// Binary property `Diacritic` + pub const Diacritic: Self = UnicodeProperty(7); + /// Binary property `Extender` + pub const Extender: Self = UnicodeProperty(8); + /// Binary property `Full_Composition_Exclusion` + pub const FullCompositionExclusion: Self = UnicodeProperty(9); + /// Binary property `Grapheme_Base` + pub const GraphemeBase: Self = UnicodeProperty(10); + /// Binary property `Grapheme_Extend` + pub const GraphemeExtend: Self = UnicodeProperty(11); + /// Binary property `Grapheme_Link` + pub const GraphemeLink: Self = UnicodeProperty(12); + /// Binary property `Hex_Digit` + pub const HexDigit: Self = UnicodeProperty(13); + /// Binary property `Hyphen` + pub const Hyphen: Self = UnicodeProperty(14); + /// Binary property `ID_Continue` + pub const IdContinue: Self = UnicodeProperty(15); + /// Binary property `ID_Start` + pub const IdStart: Self = UnicodeProperty(16); + /// Binary property `Ideographic` + pub const Ideographic: Self = UnicodeProperty(17); + /// Binary property 
`IDS_Binary_Operator` + pub const IdsBinaryOperator: Self = UnicodeProperty(18); + /// Binary property `IDS_Trinary_Operator` + pub const IdsTrinaryOperator: Self = UnicodeProperty(19); + /// Binary property `Join_Control` + pub const JoinControl: Self = UnicodeProperty(20); + /// Binary property `Logical_Order_Exception` + pub const LogicalOrderException: Self = UnicodeProperty(21); + /// Binary property `Lowercase` + pub const Lowercase: Self = UnicodeProperty(22); + /// Binary property `Math` + pub const Math: Self = UnicodeProperty(23); + /// Binary property `Noncharacter_Code_Point` + pub const NoncharacterCodePoint: Self = UnicodeProperty(24); + /// Binary property `Quotation_Mark` + pub const QuotationMark: Self = UnicodeProperty(25); + /// Binary property `Radical` + pub const Radical: Self = UnicodeProperty(26); + /// Binary property `Soft_Dotted` + pub const SoftDotted: Self = UnicodeProperty(27); + /// Binary property `Terminal_Punctuation` + pub const TerminalPunctuation: Self = UnicodeProperty(28); + /// Binary property `Unified_Ideograph` + pub const UnifiedIdeograph: Self = UnicodeProperty(29); + /// Binary property `Uppercase` + pub const Uppercase: Self = UnicodeProperty(30); + /// Binary property `White_Space` + pub const WhiteSpace: Self = UnicodeProperty(31); + /// Binary property `XID_Continue` + pub const XidContinue: Self = UnicodeProperty(32); + /// Binary property `XID_Start` + pub const XidStart: Self = UnicodeProperty(33); + /// Binary property `Case_Sensitive` + pub const CaseSensitive: Self = UnicodeProperty(34); + /// Binary property `Sentence_Terminal` + pub const SentenceTerminal: Self = UnicodeProperty(35); + /// Binary property `Variation_Selector` + pub const VariationSelector: Self = UnicodeProperty(36); + /// Binary property `NFD_Inert` + pub const NfdInert: Self = UnicodeProperty(37); + /// Binary property `NFKD_Inert` + pub const NfkdInert: Self = UnicodeProperty(38); + /// Binary property `NFC_Inert` + pub const NfcInert: 
Self = UnicodeProperty(39); + /// Binary property `NFKC_Inert` + pub const NfkcInert: Self = UnicodeProperty(40); + /// Binary property `Segment_Starter` + pub const SegmentStarter: Self = UnicodeProperty(41); + /// Binary property `Pattern_Syntax` + pub const PatternSyntax: Self = UnicodeProperty(42); + /// Binary property `Pattern_White_Space` + pub const PatternWhiteSpace: Self = UnicodeProperty(43); + /// Binary property `alnum` + pub const Alnum: Self = UnicodeProperty(44); + /// Binary property `blank` + pub const Blank: Self = UnicodeProperty(45); + /// Binary property `graph` + pub const Graph: Self = UnicodeProperty(46); + /// Binary property `print` + pub const Print: Self = UnicodeProperty(47); + /// Binary property `xdigit` + pub const XDigit: Self = UnicodeProperty(48); + /// Binary property `Cased` + pub const Cased: Self = UnicodeProperty(49); + /// Binary property `Case_Ignorable` + pub const CaseIgnorable: Self = UnicodeProperty(50); + /// Binary property `Changes_When_Lowercased` + pub const ChangesWhenLowercased: Self = UnicodeProperty(51); + /// Binary property `Changes_When_Uppercased` + pub const ChangesWhenUppercased: Self = UnicodeProperty(52); + /// Binary property `Changes_When_Titlecased` + pub const ChangesWhenTitlecased: Self = UnicodeProperty(53); + /// Binary property `Changes_When_Casefolded` + pub const ChangesWhenCasefolded: Self = UnicodeProperty(54); + /// Binary property `Changes_When_Casemapped` + pub const ChangesWhenCasemapped: Self = UnicodeProperty(55); + /// Binary property `Changes_When_NFKC_Casefolded` + pub const ChangesWhenNfkcCasefolded: Self = UnicodeProperty(56); + /// Binary property `Emoji` + pub const Emoji: Self = UnicodeProperty(57); + /// Binary property `Emoji_Presentation` + pub const EmojiPresentation: Self = UnicodeProperty(58); + /// Binary property `Emoji_Modifier` + pub const EmojiModifier: Self = UnicodeProperty(59); + /// Binary property `Emoji_Modifier_Base` + pub const EmojiModifierBase: Self = 
UnicodeProperty(60); + /// Binary property `Emoji_Component` + pub const EmojiComponent: Self = UnicodeProperty(61); + /// Binary property `Regional_Indicator` + pub const RegionalIndicator: Self = UnicodeProperty(62); + /// Binary property `Prepended_Concatenation_Mark` + pub const PrependedConcatenationMark: Self = UnicodeProperty(63); + /// Binary property `Extended_Pictographic` + pub const ExtendedPictographic: Self = UnicodeProperty(64); + /// Binary property `Basic_Emoji` + pub const BasicEmoji: Self = UnicodeProperty(65); + /// Binary property `Emoji_Keycap_Sequence` + pub const EmojiKeycapSequence: Self = UnicodeProperty(66); + /// Binary property `RGI_Emoji_Modifier_Sequence` + pub const RgiEmojiModifierSequence: Self = UnicodeProperty(67); + /// Binary property `RGI_Emoji_Flag_Sequence` + pub const RgiEmojiFlagSequence: Self = UnicodeProperty(68); + /// Binary property `RGI_Emoji_Tag_Sequence` + pub const RgiEmojiTagSequence: Self = UnicodeProperty(69); + /// Binary property `RGI_Emoji_ZWJ_Sequence` + pub const RgiEmojiZWJSequence: Self = UnicodeProperty(70); + /// Binary property `RGI_Emoji` + pub const RgiEmoji: Self = UnicodeProperty(71); + + const BINARY_MAX: Self = Self::RgiEmoji; + + /// Enumerated property `Bidi_Class` + pub const BidiClass: Self = UnicodeProperty(0x1000); + /// Enumerated property `Block` + pub const Block: Self = UnicodeProperty(0x1001); + /// Enumerated property `Canonical_Combining_Class` + pub const CombiningClass: Self = UnicodeProperty(0x1002); + /// Enumerated property `Decomposition_Type` + pub const DecompositionType: Self = UnicodeProperty(0x1003); + /// Enumerated property `East_Asian_Width` + pub const EastAsianWidth: Self = UnicodeProperty(0x1004); + /// Enumerated property `General_Category` + pub const GeneralCategory: Self = UnicodeProperty(0x1005); + /// Enumerated property `Joining_Group` + pub const JoiningGroup: Self = UnicodeProperty(0x1006); + /// Enumerated property `Joining_Type` + pub const JoiningType: 
Self = UnicodeProperty(0x1007); + /// Enumerated property `Line_Break` + pub const LineBreak: Self = UnicodeProperty(0x1008); + /// Enumerated property `Numeric_Type` + pub const NumericType: Self = UnicodeProperty(0x1009); + /// Enumerated property `Script` + pub const Script: Self = UnicodeProperty(0x100A); + /// Enumerated property `Hangul_Syllable_Type` + pub const HangulSyllableType: Self = UnicodeProperty(0x100B); + /// Enumerated property `NFD_Quick_Check` + pub const NFDQuickCheck: Self = UnicodeProperty(0x100C); + /// Enumerated property `NFKD_Quick_Check` + pub const NFKDQuickCheck: Self = UnicodeProperty(0x100D); + /// Enumerated property `NFC_Quick_Check` + pub const NFCQuickCheck: Self = UnicodeProperty(0x100E); + /// Enumerated property `NFKC_Quick_Check` + pub const NFKCQuickCheck: Self = UnicodeProperty(0x100F); + /// Enumerated property `Lead_Canonical_Combining_Class` + pub const LeadCanonicalCombiningClass: Self = UnicodeProperty(0x1010); + /// Enumerated property `Trail_Canonical_Combining_Class` + pub const TrailCanonicalCombiningClass: Self = UnicodeProperty(0x1011); + /// Enumerated property `Grapheme_Cluster_Break` + pub const GraphemeClusterBreak: Self = UnicodeProperty(0x1012); + /// Enumerated property `Sentence_Break` + pub const SentenceBreak: Self = UnicodeProperty(0x1013); + /// Enumerated property `Word_Break` + pub const WordBreak: Self = UnicodeProperty(0x1014); + /// Enumerated property `Bidi_Paired_Bracket_Type` + pub const BidiPairedBracketType: Self = UnicodeProperty(0x1015); + /// Enumerated property `Indic_Positional_Category` + pub const IndicPositionalCategory: Self = UnicodeProperty(0x1016); + /// Enumerated property `Indic_Syllabic_Category` + pub const IndicSyllabicCategory: Self = UnicodeProperty(0x1017); + /// Enumerated property `Vertical_Orientation` + pub const VerticalOrientation: Self = UnicodeProperty(0x1018); + + const ENUMERATED_MAX: Self = Self::VerticalOrientation; + + /// Mask property 
`General_Category_Mask` + pub const GeneralCategoryMask: Self = UnicodeProperty(0x2000); + + /// Double property `Numeric_Value` + pub const NumericValue: Self = UnicodeProperty(0x3000); + + /// String property `Age` + pub const Age: Self = UnicodeProperty(0x4000); + /// String property `Bidi_Mirroring_Glyph` + pub const BidiMirroringGlyph: Self = UnicodeProperty(0x4001); + /// String property `Case_Folding` + pub const CaseFolding: Self = UnicodeProperty(0x4002); + /// String property `ISO_Comment` + pub const ISOComment: Self = UnicodeProperty(0x4003); + /// String property `Lowercase_Mapping` + pub const LowercaseMapping: Self = UnicodeProperty(0x4004); + /// String property `Name` + pub const Name: Self = UnicodeProperty(0x4005); + /// String property `Simple_Case_Folding` + pub const SimpleCaseFolding: Self = UnicodeProperty(0x4006); + /// String property `Simple_Lowercase_Mapping` + pub const SimpleLowercaseMapping: Self = UnicodeProperty(0x4007); + /// String property `Simple_Titlecase_Mapping` + pub const SimpleTitlecaseMapping: Self = UnicodeProperty(0x4008); + /// String property `Simple_Uppercase_Mapping` + pub const SimpleUppercaseMapping: Self = UnicodeProperty(0x4009); + /// String property `Titlecase_Mapping` + pub const TitlecaseMapping: Self = UnicodeProperty(0x400A); + /// String property `Unicode_1_Name` + pub const Unicode1_Name: Self = UnicodeProperty(0x400B); + /// String property `Uppercase_Mapping` + pub const UppercaseMapping: Self = UnicodeProperty(0x400C); + /// String property `Bidi_Paired_Bracket` + pub const BidiPairedBracket: Self = UnicodeProperty(0x400D); + + const STRING_MAX: Self = Self::BidiPairedBracket; + + /// Misc property `Script_Extensions` + pub const ScriptExtensions: Self = UnicodeProperty(0x7000); +} + +#[allow(unused)] // experimental, may be made public later +impl UnicodeProperty { + /// Given a property name (long, short, or alias), returns the corresponding [`UnicodeProperty`] + /// value for it provided it belongs 
to the [subset relevant for ECMA262 regexes][subset] + /// + /// Returns none if the name does not match any of the names in this subset. Performs + /// strict matching of names. + /// + /// If using this to implement an ECMA262-compliant regex engine, please note these caveats: + /// + /// - This only returns binary and enumerated properties, as well as [`Self::ScriptExtensions`]. + /// Lookup can be performed sufficiently with [`Self::load_ecma262_binary_property_unstable()`], + /// [`maps::load_general_category()`], [`maps::load_script()`] and [`script::load_script_with_extensions_unstable()`]. + /// - This does not handle the `Any`, `Assigned`, or `ASCII` pseudoproperties, since they are not + /// defined as properties. + /// - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]` + /// - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`). + /// - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]` + /// - ECMA262 regexes transparently allow `General_Category_Mask` values for `GeneralCategory`. + /// This method does not return [`Self::GeneralCategoryMask`], and instead relies on the caller to use mask-related lookup + /// functions where necessary. + /// - ECMA262 regexes allow treating `General_Category` (and `gcm`) values as binary properties, + /// e.g. you can do things like `\p{Lu}` as shortform for `\p{gc=Lu}`. This method does not do so + /// since these are property values, not properties, but you can use + /// [`GeneralCategory::get_name_to_enum_mapper()`] or [`GeneralCategoryGroup::get_name_to_enum_mapper()`] + /// to handle this. 
+ /// + /// + /// [subset]: https://tc39.es/ecma262/#table-nonbinary-unicode-properties + pub fn parse_ecma262_name(name: &str) -> Option<Self> { + let prop = match name { + "General_Category" | "gc" => Self::GeneralCategory, + "Script" | "sc" => Self::Script, + "Script_Extensions" | "scx" => Self::ScriptExtensions, + "ASCII_Hex_Digit" | "AHex" => Self::AsciiHexDigit, + "Alphabetic" | "Alpha" => Self::Alphabetic, + "Bidi_Control" | "Bidi_C" => Self::BidiControl, + "Bidi_Mirrored" | "Bidi_M" => Self::BidiMirrored, + "Case_Ignorable" | "CI" => Self::CaseIgnorable, + "Cased" => Self::Cased, + "Changes_When_Casefolded" | "CWCF" => Self::ChangesWhenCasefolded, + "Changes_When_Casemapped" | "CWCM" => Self::ChangesWhenCasemapped, + "Changes_When_Lowercased" | "CWL" => Self::ChangesWhenLowercased, + "Changes_When_NFKC_Casefolded" | "CWKCF" => Self::ChangesWhenNfkcCasefolded, + "Changes_When_Titlecased" | "CWT" => Self::ChangesWhenTitlecased, + "Changes_When_Uppercased" | "CWU" => Self::ChangesWhenUppercased, + "Dash" => Self::Dash, + "Default_Ignorable_Code_Point" | "DI" => Self::DefaultIgnorableCodePoint, + "Deprecated" | "Dep" => Self::Deprecated, + "Diacritic" | "Dia" => Self::Diacritic, + "Emoji" => Self::Emoji, + "Emoji_Component" | "EComp" => Self::EmojiComponent, + "Emoji_Modifier" | "EMod" => Self::EmojiModifier, + "Emoji_Modifier_Base" | "EBase" => Self::EmojiModifierBase, + "Emoji_Presentation" | "EPres" => Self::EmojiPresentation, + "Extended_Pictographic" | "ExtPict" => Self::ExtendedPictographic, + "Extender" | "Ext" => Self::Extender, + "Grapheme_Base" | "Gr_Base" => Self::GraphemeBase, + "Grapheme_Extend" | "Gr_Ext" => Self::GraphemeExtend, + "Hex_Digit" | "Hex" => Self::HexDigit, + "IDS_Binary_Operator" | "IDSB" => Self::IdsBinaryOperator, + "IDS_Trinary_Operator" | "IDST" => Self::IdsTrinaryOperator, + "ID_Continue" | "IDC" => Self::IdContinue, + "ID_Start" | "IDS" => Self::IdStart, + "Ideographic" | "Ideo" => Self::Ideographic, + "Join_Control" | "Join_C" 
=> Self::JoinControl, + "Logical_Order_Exception" | "LOE" => Self::LogicalOrderException, + "Lowercase" | "Lower" => Self::Lowercase, + "Math" => Self::Math, + "Noncharacter_Code_Point" | "NChar" => Self::NoncharacterCodePoint, + "Pattern_Syntax" | "Pat_Syn" => Self::PatternSyntax, + "Pattern_White_Space" | "Pat_WS" => Self::PatternWhiteSpace, + "Quotation_Mark" | "QMark" => Self::QuotationMark, + "Radical" => Self::Radical, + "Regional_Indicator" | "RI" => Self::RegionalIndicator, + "Sentence_Terminal" | "STerm" => Self::SentenceTerminal, + "Soft_Dotted" | "SD" => Self::SoftDotted, + "Terminal_Punctuation" | "Term" => Self::TerminalPunctuation, + "Unified_Ideograph" | "UIdeo" => Self::UnifiedIdeograph, + "Uppercase" | "Upper" => Self::Uppercase, + "Variation_Selector" | "VS" => Self::VariationSelector, + "White_Space" | "space" => Self::WhiteSpace, + "XID_Continue" | "XIDC" => Self::XidContinue, + "XID_Start" | "XIDS" => Self::XidStart, + _ => return None, + }; + + Some(prop) + } +} diff --git a/third_party/rust/icu_properties/src/script.rs b/third_party/rust/icu_properties/src/script.rs new file mode 100644 index 0000000000..7e2595a4c4 --- /dev/null +++ b/third_party/rust/icu_properties/src/script.rs @@ -0,0 +1,648 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Data and APIs for supporting both Script and Script_Extensions property +//! values in an efficient structure. 
+ +use crate::error::PropertiesError; +use crate::props::Script; +use crate::props::ScriptULE; +use crate::provider::*; + +use core::iter::FromIterator; +use core::ops::RangeInclusive; +use icu_collections::codepointinvlist::CodePointInversionList; +use icu_provider::prelude::*; +use zerovec::{ule::AsULE, ZeroSlice}; + +/// The number of bits at the low-end of a `ScriptWithExt` value used for +/// storing the `Script` value (or `extensions` index). +const SCRIPT_VAL_LENGTH: u16 = 10; + +/// The bit mask necessary to retrieve the `Script` value (or `extensions` index) +/// from a `ScriptWithExt` value. +const SCRIPT_X_SCRIPT_VAL: u16 = (1 << SCRIPT_VAL_LENGTH) - 1; + +/// An internal-use only pseudo-property that represents the values stored in +/// the trie of the special data structure [`ScriptWithExtensionsPropertyV1`]. +/// +/// Note: This will assume a 12-bit layout. The 2 higher order bits in positions +/// 11..10 will indicate how to deduce the Script value and Script_Extensions, +/// and the lower 10 bits 9..0 indicate either the Script value or the index +/// into the `extensions` structure. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))] +#[repr(transparent)] +#[doc(hidden)] +// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsPropertyV1` constructor +#[allow(clippy::exhaustive_structs)] // this type is stable +pub struct ScriptWithExt(pub u16); + +#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)] +#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsPropertyV1` constructor +impl ScriptWithExt { + pub const Unknown: ScriptWithExt = ScriptWithExt(0); +} + +impl AsULE for ScriptWithExt { + type ULE = ScriptULE; + + #[inline] + fn to_unaligned(self) -> Self::ULE { + Script(self.0).to_unaligned() + } + + #[inline] + fn from_unaligned(unaligned: Self::ULE) -> Self { + ScriptWithExt(Script::from_unaligned(unaligned).0) + } +} + +#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsPropertyV1` constructor +impl ScriptWithExt { + /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and + /// also indicates a Script value of [`Script::Common`]. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::script::ScriptWithExt; + /// + /// assert!(ScriptWithExt(0x04FF).is_common()); + /// assert!(ScriptWithExt(0x0400).is_common()); + /// + /// assert!(!ScriptWithExt(0x08FF).is_common()); + /// assert!(!ScriptWithExt(0x0800).is_common()); + /// + /// assert!(!ScriptWithExt(0x0CFF).is_common()); + /// assert!(!ScriptWithExt(0x0C00).is_common()); + /// + /// assert!(!ScriptWithExt(0xFF).is_common()); + /// assert!(!ScriptWithExt(0x0).is_common()); + /// ``` + pub fn is_common(&self) -> bool { + self.0 >> SCRIPT_VAL_LENGTH == 1 + } + + /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and + /// also indicates a Script value of [`Script::Inherited`]. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::properties::script::ScriptWithExt; + /// + /// assert!(!ScriptWithExt(0x04FF).is_inherited()); + /// assert!(!ScriptWithExt(0x0400).is_inherited()); + /// + /// assert!(ScriptWithExt(0x08FF).is_inherited()); + /// assert!(ScriptWithExt(0x0800).is_inherited()); + /// + /// assert!(!ScriptWithExt(0x0CFF).is_inherited()); + /// assert!(!ScriptWithExt(0x0C00).is_inherited()); + /// + /// assert!(!ScriptWithExt(0xFF).is_inherited()); + /// assert!(!ScriptWithExt(0x0).is_inherited()); + /// ``` + pub fn is_inherited(&self) -> bool { + self.0 >> SCRIPT_VAL_LENGTH == 2 + } + + /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and + /// also indicates that the Script value is neither [`Script::Common`] nor + /// [`Script::Inherited`]. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::script::ScriptWithExt; + /// + /// assert!(!ScriptWithExt(0x04FF).is_other()); + /// assert!(!ScriptWithExt(0x0400).is_other()); + /// + /// assert!(!ScriptWithExt(0x08FF).is_other()); + /// assert!(!ScriptWithExt(0x0800).is_other()); + /// + /// assert!(ScriptWithExt(0x0CFF).is_other()); + /// assert!(ScriptWithExt(0x0C00).is_other()); + /// + /// assert!(!ScriptWithExt(0xFF).is_other()); + /// assert!(!ScriptWithExt(0x0).is_other()); + /// ``` + pub fn is_other(&self) -> bool { + self.0 >> SCRIPT_VAL_LENGTH == 3 + } + + /// Returns whether the [`ScriptWithExt`] value has Script_Extensions. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::properties::script::ScriptWithExt; + /// + /// assert!(ScriptWithExt(0x04FF).has_extensions()); + /// assert!(ScriptWithExt(0x0400).has_extensions()); + /// + /// assert!(ScriptWithExt(0x08FF).has_extensions()); + /// assert!(ScriptWithExt(0x0800).has_extensions()); + /// + /// assert!(ScriptWithExt(0x0CFF).has_extensions()); + /// assert!(ScriptWithExt(0x0C00).has_extensions()); + /// + /// assert!(!ScriptWithExt(0xFF).has_extensions()); + /// assert!(!ScriptWithExt(0x0).has_extensions()); + /// ``` + pub fn has_extensions(&self) -> bool { + let high_order_bits = self.0 >> SCRIPT_VAL_LENGTH; + high_order_bits > 0 + } +} + +impl From<ScriptWithExt> for u32 { + fn from(swe: ScriptWithExt) -> Self { + swe.0 as u32 + } +} + +impl From<ScriptWithExt> for Script { + fn from(swe: ScriptWithExt) -> Self { + Script(swe.0) + } +} + +/// A struct that wraps a [`Script`] array, such as in the return value for +/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val). +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct ScriptExtensionsSet<'a> { + values: &'a ZeroSlice<Script>, +} + +impl ScriptExtensionsSet<'_> { + /// Returns whether this set contains the given script. + /// + /// # Example + /// + /// ``` + /// use icu::properties::{script, Script}; + /// let swe = script::script_with_extensions(); + /// + /// assert!(swe + /// .get_script_extensions_val(0x11303) // GRANTHA SIGN VISARGA + /// .contains(&Script::Grantha)); + /// ``` + pub fn contains(&self, x: &Script) -> bool { + ZeroSlice::binary_search(self.values, x).is_ok() + } + + /// Gets an iterator over the elements. 
+ /// + /// # Example + /// + /// ``` + /// use icu::properties::{script, Script}; + /// let swe = script::script_with_extensions(); + /// + /// assert_eq!( + /// swe.get_script_extensions_val('௫' as u32) // U+0BEB TAMIL DIGIT FIVE + /// .iter() + /// .collect::<Vec<Script>>(), + /// vec![Script::Tamil, Script::Grantha] + /// ); + /// ``` + pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + '_ { + ZeroSlice::iter(self.values) + } + + /// For accessing this set as an array instead of an iterator + /// only needed for the FFI bindings; shouldn't be used directly from Rust + #[doc(hidden)] + pub fn array_len(&self) -> usize { + self.values.len() + } + /// For accessing this set as an array instead of an iterator + /// only needed for the FFI bindings; shouldn't be used directly from Rust + #[doc(hidden)] + pub fn array_get(&self, index: usize) -> Option<Script> { + self.values.get(index) + } +} + +/// A wrapper around script extensions data. Can be obtained via [`load_script_with_extensions_unstable()`] and +/// related getters. +/// +/// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`] +#[derive(Debug)] +pub struct ScriptWithExtensions { + data: DataPayload<ScriptWithExtensionsPropertyV1Marker>, +} + +/// A borrowed wrapper around script extension data, returned by +/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query. +#[derive(Clone, Copy, Debug)] +pub struct ScriptWithExtensionsBorrowed<'a> { + data: &'a ScriptWithExtensionsPropertyV1<'a>, +} + +impl ScriptWithExtensions { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it + /// up front. 
+ #[inline] + pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> { + ScriptWithExtensionsBorrowed { + data: self.data.get(), + } + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead + pub fn from_data(data: DataPayload<ScriptWithExtensionsPropertyV1Marker>) -> Self { + Self { data } + } +} + +impl<'a> ScriptWithExtensionsBorrowed<'a> { + /// Returns the `Script` property value for this code point. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::{script, Script}; + /// + /// let swe = script::script_with_extensions(); + /// + /// // U+0640 ARABIC TATWEEL + /// assert_eq!(swe.get_script_val(0x0640), Script::Common); // main Script value + /// assert_ne!(swe.get_script_val(0x0640), Script::Arabic); + /// assert_ne!(swe.get_script_val(0x0640), Script::Syriac); + /// assert_ne!(swe.get_script_val(0x0640), Script::Thaana); + /// + /// // U+0650 ARABIC KASRA + /// assert_eq!(swe.get_script_val(0x0650), Script::Inherited); // main Script value + /// assert_ne!(swe.get_script_val(0x0650), Script::Arabic); + /// assert_ne!(swe.get_script_val(0x0650), Script::Syriac); + /// assert_ne!(swe.get_script_val(0x0650), Script::Thaana); + /// + /// // U+0660 ARABIC-INDIC DIGIT ZERO + /// assert_ne!(swe.get_script_val(0x0660), Script::Common); + /// assert_eq!(swe.get_script_val(0x0660), Script::Arabic); // main Script value + /// assert_ne!(swe.get_script_val(0x0660), Script::Syriac); + /// assert_ne!(swe.get_script_val(0x0660), Script::Thaana); + /// + /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM + /// assert_ne!(swe.get_script_val(0xFDF2), Script::Common); + /// assert_eq!(swe.get_script_val(0xFDF2), Script::Arabic); // main Script value + /// assert_ne!(swe.get_script_val(0xFDF2), Script::Syriac); + /// assert_ne!(swe.get_script_val(0xFDF2), Script::Thaana); + /// ``` + pub fn get_script_val(self, code_point: u32) -> Script { + let sc_with_ext 
= self.data.trie.get32(code_point); + + if sc_with_ext.is_other() { + let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL; + let scx_val = self.data.extensions.get(ext_idx as usize); + let scx_first_sc = scx_val.and_then(|scx| scx.get(0)); + + let default_sc_val = Script::Unknown; + + scx_first_sc.unwrap_or(default_sc_val) + } else if sc_with_ext.is_common() { + Script::Common + } else if sc_with_ext.is_inherited() { + Script::Inherited + } else { + let script_val = sc_with_ext.0; + Script(script_val) + } + } + // Returns the Script_Extensions value for a code_point when the trie value + // is already known. + // This private helper method exists to prevent code duplication in callers like + // `get_script_extensions_val`, `get_script_extensions_set`, and `has_script`. + fn get_scx_val_using_trie_val( + self, + sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE, + ) -> &'a ZeroSlice<Script> { + let sc_with_ext = ScriptWithExt::from_unaligned(*sc_with_ext_ule); + if sc_with_ext.is_other() { + let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL; + let ext_subarray = self.data.extensions.get(ext_idx as usize); + // In the OTHER case, where the 2 higher-order bits of the + // `ScriptWithExt` value in the trie doesn't indicate the Script value, + // the Script value is copied/inserted into the first position of the + // `extensions` array. So we must remove it to return the actual scx array val. + let scx_slice = ext_subarray + .and_then(|zslice| zslice.as_ule_slice().get(1..)) + .unwrap_or_default(); + ZeroSlice::from_ule_slice(scx_slice) + } else if sc_with_ext.is_common() || sc_with_ext.is_inherited() { + let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL; + let scx_val = self.data.extensions.get(ext_idx as usize); + scx_val.unwrap_or_default() + } else { + // Note: `Script` and `ScriptWithExt` are both represented as the same + // u16 value when the `ScriptWithExt` has no higher-order bits set. 
+ let script_ule_slice = core::slice::from_ref(sc_with_ext_ule); + ZeroSlice::from_ule_slice(script_ule_slice) + } + } + /// Return the `Script_Extensions` property value for this code point. + /// + /// If `code_point` has Script_Extensions, then return the Script codes in + /// the Script_Extensions. In this case, the Script property value + /// (normally Common or Inherited) is not included in the [`ScriptExtensionsSet`]. + /// + /// If `code_point` does not have Script_Extensions, then its one Script code + /// is the only element of the returned [`ScriptExtensionsSet`]. + /// + /// If `code_point` is not a valid code point, then return an empty [`ScriptExtensionsSet`]. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::{script, Script}; + /// + /// let swe = script::script_with_extensions(); + /// + /// assert_eq!( + /// swe.get_script_extensions_val('𐓐' as u32) // U+104D0 OSAGE CAPITAL LETTER KHA + /// .iter() + /// .collect::<Vec<Script>>(), + /// vec![Script::Osage] + /// ); + /// assert_eq!( + /// swe.get_script_extensions_val('🥳' as u32) // U+1F973 FACE WITH PARTY HORN AND PARTY HAT + /// .iter() + /// .collect::<Vec<Script>>(), + /// vec![Script::Common] + /// ); + /// assert_eq!( + /// swe.get_script_extensions_val(0x200D) // ZERO WIDTH JOINER + /// .iter() + /// .collect::<Vec<Script>>(), + /// vec![Script::Inherited] + /// ); + /// assert_eq!( + /// swe.get_script_extensions_val('௫' as u32) // U+0BEB TAMIL DIGIT FIVE + /// .iter() + /// .collect::<Vec<Script>>(), + /// vec![Script::Tamil, Script::Grantha] + /// ); + /// ``` + pub fn get_script_extensions_val(self, code_point: u32) -> ScriptExtensionsSet<'a> { + let sc_with_ext_ule = self.data.trie.get32_ule(code_point); + + ScriptExtensionsSet { + values: match sc_with_ext_ule { + Some(ule_ref) => self.get_scx_val_using_trie_val(ule_ref), + None => ZeroSlice::from_ule_slice(&[]), + }, + } + } + + /// Returns whether `script` is contained in the Script_Extensions + /// property value if the code_point has
Script_Extensions, otherwise + /// if the code point does not have Script_Extensions then returns + /// whether the Script property value matches. + /// + /// Some characters are commonly used in multiple scripts. For more information, + /// see UAX #24: <http://www.unicode.org/reports/tr24/>. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::{script, Script}; + /// + /// let swe = script::script_with_extensions(); + /// + /// // U+0650 ARABIC KASRA + /// assert!(!swe.has_script(0x0650, Script::Inherited)); // main Script value + /// assert!(swe.has_script(0x0650, Script::Arabic)); + /// assert!(swe.has_script(0x0650, Script::Syriac)); + /// assert!(!swe.has_script(0x0650, Script::Thaana)); + /// + /// // U+0660 ARABIC-INDIC DIGIT ZERO + /// assert!(!swe.has_script(0x0660, Script::Common)); // main Script value + /// assert!(swe.has_script(0x0660, Script::Arabic)); + /// assert!(!swe.has_script(0x0660, Script::Syriac)); + /// assert!(swe.has_script(0x0660, Script::Thaana)); + /// + /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM + /// assert!(!swe.has_script(0xFDF2, Script::Common)); + /// assert!(swe.has_script(0xFDF2, Script::Arabic)); // main Script value + /// assert!(!swe.has_script(0xFDF2, Script::Syriac)); + /// assert!(swe.has_script(0xFDF2, Script::Thaana)); + /// ``` + pub fn has_script(self, code_point: u32, script: Script) -> bool { + let sc_with_ext_ule = if let Some(scwe_ule) = self.data.trie.get32_ule(code_point) { + scwe_ule + } else { + return false; + }; + let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule); + + if !sc_with_ext.has_extensions() { + let script_val = sc_with_ext.0; + script == Script(script_val) + } else { + let scx_val = self.get_scx_val_using_trie_val(sc_with_ext_ule); + let script_find = scx_val.iter().find(|&sc| sc == script); + script_find.is_some() + } + } + + /// Returns all of the matching `CodePointMapRange`s for the given [`Script`] + /// in which `has_script` will return true for 
all of the contained code points. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::{script, Script}; + /// + /// let swe = script::script_with_extensions(); + /// + /// let syriac_script_extensions_ranges = swe.get_script_extensions_ranges(Script::Syriac); + /// + /// let exp_ranges = vec![ + /// 0x060C..=0x060C, // ARABIC COMMA + /// 0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK + /// 0x061F..=0x061F, // ARABIC QUESTION MARK + /// 0x0640..=0x0640, // ARABIC TATWEEL + /// 0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW + /// 0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF + /// 0x0700..=0x070D, // Syriac block begins at U+0700 + /// 0x070F..=0x074A, // Syriac block + /// 0x074D..=0x074F, // Syriac block ends at U+074F + /// 0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F + /// 0x1DF8..=0x1DF8, // U+1DF8 COMBINING DOT ABOVE LEFT + /// 0x1DFA..=0x1DFA, // U+1DFA COMBINING DOT BELOW LEFT + /// ]; + /// let mut exp_ranges_iter = exp_ranges.iter(); + /// + /// for act_range in syriac_script_extensions_ranges { + /// let exp_range = exp_ranges_iter + /// .next() + /// .expect("There are too many ranges returned by get_script_extensions_ranges()"); + /// assert_eq!(act_range.start(), exp_range.start()); + /// assert_eq!(act_range.end(), exp_range.end()); + /// } + /// assert!( + /// exp_ranges_iter.next().is_none(), + /// "There are too few ranges returned by get_script_extensions_ranges()" + /// ); + /// ``` + pub fn get_script_extensions_ranges( + self, + script: Script, + ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { + self.data + .trie + .iter_ranges_mapped(move |value| { + let sc_with_ext = ScriptWithExt(value.0); + if sc_with_ext.has_extensions() { + self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned()) + .iter() + .any(|sc| sc == script) + } else { + script == sc_with_ext.into() + } + }) + .filter(|v| v.value) + .map(|v| v.range) + } + + /// Returns a [`CodePointInversionList`] for the given 
[`Script`] which represents all + /// code points for which `has_script` will return true. + /// + /// # Examples + /// + /// ``` + /// use icu::properties::{script, Script}; + /// + /// let swe = script::script_with_extensions(); + /// + /// let syriac = swe.get_script_extensions_set(Script::Syriac); + /// + /// assert!(!syriac.contains32(0x061E)); // ARABIC TRIPLE DOT PUNCTUATION MARK + /// assert!(syriac.contains32(0x061F)); // ARABIC QUESTION MARK + /// assert!(!syriac.contains32(0x0620)); // ARABIC LETTER KASHMIRI YEH + /// + /// assert!(syriac.contains32(0x0700)); // SYRIAC END OF PARAGRAPH + /// assert!(syriac.contains32(0x074A)); // SYRIAC BARREKH + /// assert!(!syriac.contains32(0x074B)); // unassigned + /// assert!(syriac.contains32(0x074F)); // SYRIAC LETTER SOGDIAN FE + /// assert!(!syriac.contains32(0x0750)); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW + /// + /// assert!(syriac.contains32(0x1DF8)); // COMBINING DOT ABOVE LEFT + /// assert!(!syriac.contains32(0x1DF9)); // COMBINING WIDE INVERTED BRIDGE BELOW + /// assert!(syriac.contains32(0x1DFA)); // COMBINING DOT BELOW LEFT + /// assert!(!syriac.contains32(0x1DFB)); // COMBINING DELETION MARK + /// ``` + pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> { + CodePointInversionList::from_iter(self.get_script_extensions_ranges(script)) + } +} + +impl ScriptWithExtensionsBorrowed<'static> { + /// Cheaply converts a `ScriptWithExtensionsBorrowed<'static>` into a `ScriptWithExtensions`. + pub const fn static_to_owned(self) -> ScriptWithExtensions { + ScriptWithExtensions { + data: DataPayload::from_static_ref(self.data), + } + } +} + +/// Returns a [`ScriptWithExtensionsBorrowed`] struct that represents the data for the Script +/// and Script_Extensions properties. 
+/// +/// ✨ *Enabled with the `compiled_data` Cargo feature.* +/// +/// [📚 Help choosing a constructor](icu_provider::constructors) +/// +/// # Examples +/// +/// ``` +/// use icu::properties::{script, Script}; +/// let swe = script::script_with_extensions(); +/// +/// // get the `Script` property value +/// assert_eq!(swe.get_script_val(0x0640), Script::Common); // U+0640 ARABIC TATWEEL +/// assert_eq!(swe.get_script_val(0x0650), Script::Inherited); // U+0650 ARABIC KASRA +/// assert_eq!(swe.get_script_val(0x0660), Script::Arabic); // // U+0660 ARABIC-INDIC DIGIT ZERO +/// assert_eq!(swe.get_script_val(0xFDF2), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM +/// +/// // get the `Script_Extensions` property value +/// assert_eq!( +/// swe.get_script_extensions_val(0x0640) // U+0640 ARABIC TATWEEL +/// .iter().collect::<Vec<Script>>(), +/// vec![Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean, +/// Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian, +/// Script::OldUyghur] +/// ); +/// assert_eq!( +/// swe.get_script_extensions_val('🥳' as u32) // U+1F973 FACE WITH PARTY HORN AND PARTY HAT +/// .iter().collect::<Vec<Script>>(), +/// vec![Script::Common] +/// ); +/// assert_eq!( +/// swe.get_script_extensions_val(0x200D) // ZERO WIDTH JOINER +/// .iter().collect::<Vec<Script>>(), +/// vec![Script::Inherited] +/// ); +/// assert_eq!( +/// swe.get_script_extensions_val('௫' as u32) // U+0BEB TAMIL DIGIT FIVE +/// .iter().collect::<Vec<Script>>(), +/// vec![Script::Tamil, Script::Grantha] +/// ); +/// +/// // check containment of a `Script` value in the `Script_Extensions` value +/// // U+0650 ARABIC KASRA +/// assert!(!swe.has_script(0x0650, Script::Inherited)); // main Script value +/// assert!(swe.has_script(0x0650, Script::Arabic)); +/// assert!(swe.has_script(0x0650, Script::Syriac)); +/// assert!(!swe.has_script(0x0650, Script::Thaana)); +/// +/// // get a `CodePointInversionList` for when `Script` 
value is contained in `Script_Extensions` value +/// let syriac = swe.get_script_extensions_set(Script::Syriac); +/// assert!(syriac.contains32(0x0650)); // ARABIC KASRA +/// assert!(!syriac.contains32(0x0660)); // ARABIC-INDIC DIGIT ZERO +/// assert!(!syriac.contains32(0xFDF2)); // ARABIC LIGATURE ALLAH ISOLATED FORM +/// assert!(syriac.contains32(0x0700)); // SYRIAC END OF PARAGRAPH +/// assert!(syriac.contains32(0x074A)); // SYRIAC BARREKH +/// ``` +#[cfg(feature = "compiled_data")] +pub const fn script_with_extensions() -> ScriptWithExtensionsBorrowed<'static> { + ScriptWithExtensionsBorrowed { + data: crate::provider::Baked::SINGLETON_PROPS_SCX_V1, + } +} + +icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + options: skip, + result: Result<ScriptWithExtensions, PropertiesError>, + #[cfg(skip)] + functions: [ + script_with_extensions, + load_script_with_extensions_with_any_provider, + load_script_with_extensions_with_buffer_provider, + load_script_with_extensions_unstable, + ] +); + +#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, script_with_extensions)] +pub fn load_script_with_extensions_unstable( + provider: &(impl DataProvider<ScriptWithExtensionsPropertyV1Marker> + ?Sized), +) -> Result<ScriptWithExtensions, PropertiesError> { + Ok(ScriptWithExtensions::from_data( + provider + .load(Default::default()) + .and_then(DataResponse::take_payload)?, + )) +} diff --git a/third_party/rust/icu_properties/src/sets.rs b/third_party/rust/icu_properties/src/sets.rs new file mode 100644 index 0000000000..3fd229f72c --- /dev/null +++ b/third_party/rust/icu_properties/src/sets.rs @@ -0,0 +1,2381 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! The functions in this module return a [`CodePointSetData`] containing +//! the set of characters with a particular Unicode property. +//! 
+//! The descriptions of most properties are taken from [`TR44`], the documentation for the +//! Unicode Character Database. Some properties are instead defined in [`TR18`], the +//! documentation for Unicode regular expressions. In particular, Annex C of this document +//! defines properties for POSIX compatibility. +//! +//! [`CodePointSetData`]: crate::sets::CodePointSetData +//! [`TR44`]: https://www.unicode.org/reports/tr44 +//! [`TR18`]: https://www.unicode.org/reports/tr18 + +use crate::error::PropertiesError; +use crate::provider::*; +use crate::*; +use core::iter::FromIterator; +use core::ops::RangeInclusive; +use icu_collections::codepointinvlist::CodePointInversionList; +use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList; +use icu_provider::prelude::*; + +// +// CodePointSet* structs, impls, & macros +// (a set with only code points) +// + +/// A wrapper around code point set data. It is returned by APIs that return Unicode +/// property data in a set-like form, ex: a set of code points sharing the same +/// value for a Unicode property. Access its data via the borrowed version, +/// [`CodePointSetDataBorrowed`]. +#[derive(Debug)] +pub struct CodePointSetData { + data: DataPayload<ErasedSetlikeMarker>, +} + +/// Private marker type for CodePointSetData +/// to work for all set properties at once +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub(crate) struct ErasedSetlikeMarker; +impl DataMarker for ErasedSetlikeMarker { + type Yokeable = PropertyCodePointSetV1<'static>; +} + +impl CodePointSetData { + /// Construct a borrowed version of this type that can be queried. + /// + /// This owned version is returned by functions that use a runtime data provider.
+ #[inline] + pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> { + CodePointSetDataBorrowed { + set: self.data.get(), + } + } + + /// Construct a new one from loaded data + /// + /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead + pub fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyCodePointSetV1<'static>>, + { + Self { data: data.cast() } + } + + /// Construct a new owned [`CodePointInversionList`] + pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self { + let set = PropertyCodePointSetV1::from_code_point_inversion_list(set); + CodePointSetData::from_data(DataPayload::<ErasedSetlikeMarker>::from_owned(set)) + } + + /// Convert this type to a [`CodePointInversionList`] as a borrowed value. + /// + /// The data backing this is extensible and supports multiple implementations. + /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be + /// added, and users may select which at data generation time. + /// + /// This method returns an `Option` in order to return `None` when the backing data provider + /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time + /// constraint. + pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> { + self.data.get().as_code_point_inversion_list() + } + + /// Convert this type to a [`CodePointInversionList`], borrowing if possible, + /// otherwise allocating a new [`CodePointInversionList`]. + /// + /// The data backing this is extensible and supports multiple implementations. + /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be + /// added, and users may select which at data generation time. + /// + /// The performance of the conversion to this specific return type will vary + /// depending on the data structure that is backing `self`. 
+ pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> { + self.data.get().to_code_point_inversion_list() + } +} + +/// A borrowed wrapper around code point set data, returned by +/// [`CodePointSetData::as_borrowed()`]. More efficient to query. +#[derive(Clone, Copy, Debug)] +pub struct CodePointSetDataBorrowed<'a> { + set: &'a PropertyCodePointSetV1<'a>, +} + +impl CodePointSetDataBorrowed<'static> { + /// Cheaply converts a `CodePointSetDataBorrowed<'static>` into a `CodePointSetData`. + pub const fn static_to_owned(self) -> CodePointSetData { + CodePointSetData { + data: DataPayload::from_static_ref(self.set), + } + } +} + +impl<'a> CodePointSetDataBorrowed<'a> { + /// Check if the set contains a character + /// + /// ```rust + /// use icu_properties::sets; + /// + /// let alphabetic = sets::alphabetic(); + /// + /// assert!(!alphabetic.contains('3')); + /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE + /// assert!(alphabetic.contains('A')); + /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + /// ``` + #[inline] + pub fn contains(self, ch: char) -> bool { + self.set.contains(ch) + } + + /// Check if the set contains a character as a UTF32 code unit + /// + /// ```rust + /// use icu_properties::sets; + /// + /// let alphabetic = sets::alphabetic(); + /// + /// assert!(!alphabetic.contains32(0x0A69)); // U+0A69 GURMUKHI DIGIT THREE + /// assert!(alphabetic.contains32(0x00C4)); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + /// ``` + #[inline] + pub fn contains32(self, ch: u32) -> bool { + self.set.contains32(ch) + } + + /// Yields an [`Iterator`] returning the ranges of the code points that are + /// included in the [`CodePointSetData`] + /// + /// Ranges are returned as [`RangeInclusive`], which is inclusive of its + /// `end` bound value. An end-inclusive behavior matches the ICU4C/J + /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
+ /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let alphabetic = sets::alphabetic(); + /// let mut ranges = alphabetic.iter_ranges(); + /// + /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z' + /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z' + /// ``` + #[inline] + pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { + self.set.iter_ranges() + } + + /// Yields an [`Iterator`] returning the ranges of the code points that are + /// *not* included in the [`CodePointSetData`] + /// + /// Ranges are returned as [`RangeInclusive`], which is inclusive of its + /// `end` bound value. An end-inclusive behavior matches the ICU4C/J + /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`. + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let alphabetic = sets::alphabetic(); + /// let mut ranges = alphabetic.iter_ranges_complemented(); + /// + /// assert_eq!(Some(0x0000..=0x0040), ranges.next()); // everything before 'A' + /// assert_eq!(Some(0x005B..=0x0060), ranges.next()); // between 'Z' and 'a' + /// ``` + #[inline] + pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a { + self.set.iter_ranges_complemented() + } +} + +// +// UnicodeSet* structs, impls, & macros +// (a set with code points + strings) +// + +/// A wrapper around `UnicodeSet` data (characters and strings) +#[derive(Debug)] +pub struct UnicodeSetData { + data: DataPayload<ErasedUnicodeSetlikeMarker>, +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub(crate) struct ErasedUnicodeSetlikeMarker; +impl DataMarker for ErasedUnicodeSetlikeMarker { + type Yokeable = PropertyUnicodeSetV1<'static>; +} + +impl UnicodeSetData { + /// Construct a borrowed version of this type that can be queried. + /// + /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it + /// up front.
+ #[inline] + pub fn as_borrowed(&self) -> UnicodeSetDataBorrowed<'_> { + UnicodeSetDataBorrowed { + set: self.data.get(), + } + } + + /// Construct a new [`UnicodeSetData`] from loaded data + /// + /// Typically it is preferable to use getters instead + pub fn from_data<M>(data: DataPayload<M>) -> Self + where + M: DataMarker<Yokeable = PropertyUnicodeSetV1<'static>>, + { + Self { data: data.cast() } + } + + /// Construct a new [`UnicodeSetData`] from an owned [`CodePointInversionListAndStringList`] + pub fn from_code_point_inversion_list_string_list( + set: CodePointInversionListAndStringList<'static>, + ) -> Self { + let set = PropertyUnicodeSetV1::from_code_point_inversion_list_string_list(set); + UnicodeSetData::from_data(DataPayload::<ErasedUnicodeSetlikeMarker>::from_owned(set)) + } + + /// Convert this type to a [`CodePointInversionListAndStringList`] as a borrowed value. + /// + /// The data backing this is extensible and supports multiple implementations. + /// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be + /// added, and users may select which at data generation time. + /// + /// This method returns an `Option` in order to return `None` when the backing data provider + /// cannot return a [`CodePointInversionListAndStringList`], or cannot do so within the expected constant time + /// constraint. + pub fn as_code_point_inversion_list_string_list( + &self, + ) -> Option<&CodePointInversionListAndStringList<'_>> { + self.data.get().as_code_point_inversion_list_string_list() + } + + /// Convert this type to a [`CodePointInversionListAndStringList`], borrowing if possible, + /// otherwise allocating a new [`CodePointInversionListAndStringList`]. + /// + /// The data backing this is extensible and supports multiple implementations. + /// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be + /// added, and users may select which at data generation time. 
+ /// + /// The performance of the conversion to this specific return type will vary + /// depending on the data structure that is backing `self`. + pub fn to_code_point_inversion_list_string_list( + &self, + ) -> CodePointInversionListAndStringList<'_> { + self.data.get().to_code_point_inversion_list_string_list() + } +} + +/// A borrowed wrapper around `UnicodeSet` data (code points and strings), returned by +/// [`UnicodeSetData::as_borrowed()`]. More efficient to query. +#[derive(Clone, Copy, Debug)] +pub struct UnicodeSetDataBorrowed<'a> { + set: &'a PropertyUnicodeSetV1<'a>, +} + +impl<'a> UnicodeSetDataBorrowed<'a> { + /// Check if the set contains the string. Strings consisting of one character + /// are treated as a character/code point. + /// + /// This matches ICU behavior for ICU's `UnicodeSet`. + #[inline] + pub fn contains(self, s: &str) -> bool { + self.set.contains(s) + } + + /// Check if the set contains a character as a UTF32 code unit + #[inline] + pub fn contains32(&self, cp: u32) -> bool { + self.set.contains32(cp) + } + + /// Check if the set contains the code point corresponding to the Rust character. + #[inline] + pub fn contains_char(&self, ch: char) -> bool { + self.set.contains_char(ch) + } +} + +impl UnicodeSetDataBorrowed<'static> { + /// Cheaply converts a `UnicodeSetDataBorrowed<'static>` into a `UnicodeSetData`. + pub const fn static_to_owned(self) -> UnicodeSetData { + UnicodeSetData { + data: DataPayload::from_static_ref(self.set), + } + } +} + +pub(crate) fn load_set_data<M, P>(provider: &P) -> Result<CodePointSetData, PropertiesError> +where + M: KeyedDataMarker<Yokeable = PropertyCodePointSetV1<'static>>, + P: DataProvider<M> + ?Sized, +{ + Ok(provider + .load(Default::default()) + .and_then(DataResponse::take_payload) + .map(CodePointSetData::from_data)?) +} + +// +// Binary property getter fns +// (data as code point sets) +// + +macro_rules! 
make_code_point_set_property { + ( + // currently unused + property: $property:expr; + // currently unused + marker: $marker_name:ident; + keyed_data_marker: $keyed_data_marker:ty; + func: + $(#[$doc:meta])+ + $cvis:vis const fn $constname:ident() => $singleton_name:ident; + $vis:vis fn $funcname:ident(); + ) => { + #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")] + /// + /// Note that this will return an owned version of the data. Functionality is available on + /// the borrowed version, accessible through [`CodePointSetData::as_borrowed`]. + $vis fn $funcname( + provider: &(impl DataProvider<$keyed_data_marker> + ?Sized) + ) -> Result<CodePointSetData, PropertiesError> { + load_set_data(provider) + } + + $(#[$doc])* + #[cfg(feature = "compiled_data")] + $cvis const fn $constname() -> CodePointSetDataBorrowed<'static> { + CodePointSetDataBorrowed { + set: crate::provider::Baked::$singleton_name, + } + } + } +} + +make_code_point_set_property! { + property: "ASCII_Hex_Digit"; + marker: AsciiHexDigitProperty; + keyed_data_marker: AsciiHexDigitV1Marker; + func: + /// ASCII characters commonly used for the representation of hexadecimal numbers + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let ascii_hex_digit = sets::ascii_hex_digit(); + /// + /// assert!(ascii_hex_digit.contains('3')); + /// assert!(!ascii_hex_digit.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE + /// assert!(ascii_hex_digit.contains('A')); + /// assert!(!ascii_hex_digit.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + /// ``` + pub const fn ascii_hex_digit() => SINGLETON_PROPS_AHEX_V1; + pub fn load_ascii_hex_digit(); +} + +make_code_point_set_property! 
{ + property: "Alnum"; + marker: AlnumProperty; + keyed_data_marker: AlnumV1Marker; + func: + /// Characters with the Alphabetic or Decimal_Number property + /// This is defined for POSIX compatibility. + + pub const fn alnum() => SINGLETON_PROPS_ALNUM_V1; + pub fn load_alnum(); +} + +make_code_point_set_property! { + property: "Alphabetic"; + marker: AlphabeticProperty; + keyed_data_marker: AlphabeticV1Marker; + func: + /// Alphabetic characters + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let alphabetic = sets::alphabetic(); + /// + /// assert!(!alphabetic.contains('3')); + /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE + /// assert!(alphabetic.contains('A')); + /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + /// ``` + + pub const fn alphabetic() => SINGLETON_PROPS_ALPHA_V1; + pub fn load_alphabetic(); +} + +make_code_point_set_property! { + property: "Bidi_Control"; + marker: BidiControlProperty; + keyed_data_marker: BidiControlV1Marker; + func: + /// Format control characters which have specific functions in the Unicode Bidirectional + /// Algorithm + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let bidi_control = sets::bidi_control(); + /// + /// assert!(bidi_control.contains32(0x200F)); // RIGHT-TO-LEFT MARK + /// assert!(!bidi_control.contains('ش')); // U+0634 ARABIC LETTER SHEEN + /// ``` + + pub const fn bidi_control() => SINGLETON_PROPS_BIDI_C_V1; + pub fn load_bidi_control(); +} + +make_code_point_set_property! 
{ + property: "Bidi_Mirrored"; + marker: BidiMirroredProperty; + keyed_data_marker: BidiMirroredV1Marker; + func: + /// Characters that are mirrored in bidirectional text + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let bidi_mirrored = sets::bidi_mirrored(); + /// + /// assert!(bidi_mirrored.contains('[')); + /// assert!(bidi_mirrored.contains(']')); + /// assert!(bidi_mirrored.contains('∑')); // U+2211 N-ARY SUMMATION + /// assert!(!bidi_mirrored.contains('ཉ')); // U+0F49 TIBETAN LETTER NYA + /// ``` + + pub const fn bidi_mirrored() => SINGLETON_PROPS_BIDI_M_V1; + pub fn load_bidi_mirrored(); +} + +make_code_point_set_property! { + property: "Blank"; + marker: BlankProperty; + keyed_data_marker: BlankV1Marker; + func: + /// Horizontal whitespace characters + + pub const fn blank() => SINGLETON_PROPS_BLANK_V1; + pub fn load_blank(); +} + +make_code_point_set_property! { + property: "Cased"; + marker: CasedProperty; + keyed_data_marker: CasedV1Marker; + func: + /// Uppercase, lowercase, and titlecase characters + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let cased = sets::cased(); + /// + /// assert!(cased.contains('Ꙡ')); // U+A660 CYRILLIC CAPITAL LETTER REVERSED TSE + /// assert!(!cased.contains('ދ')); // U+078B THAANA LETTER DHAALU + /// ``` + + pub const fn cased() => SINGLETON_PROPS_CASED_V1; + pub fn load_cased(); +} + +make_code_point_set_property! 
{ + property: "Case_Ignorable"; + marker: CaseIgnorableProperty; + keyed_data_marker: CaseIgnorableV1Marker; + func: + /// Characters which are ignored for casing purposes + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let case_ignorable = sets::case_ignorable(); + /// + /// assert!(case_ignorable.contains(':')); + /// assert!(!case_ignorable.contains('λ')); // U+03BB GREEK SMALL LETTER LAMDA + /// ``` + + pub const fn case_ignorable() => SINGLETON_PROPS_CI_V1; + pub fn load_case_ignorable(); +} + +make_code_point_set_property! { + property: "Full_Composition_Exclusion"; + marker: FullCompositionExclusionProperty; + keyed_data_marker: FullCompositionExclusionV1Marker; + func: + /// Characters that are excluded from composition + /// See <https://unicode.org/Public/UNIDATA/CompositionExclusions.txt> + + pub const fn full_composition_exclusion() => SINGLETON_PROPS_COMP_EX_V1; + pub fn load_full_composition_exclusion(); +} + +make_code_point_set_property! { + property: "Changes_When_Casefolded"; + marker: ChangesWhenCasefoldedProperty; + keyed_data_marker: ChangesWhenCasefoldedV1Marker; + func: + /// Characters whose normalized forms are not stable under case folding + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let changes_when_casefolded = sets::changes_when_casefolded(); + /// + /// assert!(changes_when_casefolded.contains('ß')); // U+00DF LATIN SMALL LETTER SHARP S + /// assert!(!changes_when_casefolded.contains('ᜉ')); // U+1709 TAGALOG LETTER PA + /// ``` + + pub const fn changes_when_casefolded() => SINGLETON_PROPS_CWCF_V1; + pub fn load_changes_when_casefolded(); +} + +make_code_point_set_property! 
{ + property: "Changes_When_Casemapped"; + marker: ChangesWhenCasemappedProperty; + keyed_data_marker: ChangesWhenCasemappedV1Marker; + func: + /// Characters which may change when they undergo case mapping + + pub const fn changes_when_casemapped() => SINGLETON_PROPS_CWCM_V1; + pub fn load_changes_when_casemapped(); +} + +make_code_point_set_property! { + property: "Changes_When_NFKC_Casefolded"; + marker: ChangesWhenNfkcCasefoldedProperty; + keyed_data_marker: ChangesWhenNfkcCasefoldedV1Marker; + func: + /// Characters which are not identical to their NFKC_Casefold mapping + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let changes_when_nfkc_casefolded = sets::changes_when_nfkc_casefolded(); + /// + /// assert!(changes_when_nfkc_casefolded.contains('🄵')); // U+1F135 SQUARED LATIN CAPITAL LETTER F + /// assert!(!changes_when_nfkc_casefolded.contains('f')); + /// ``` + + pub const fn changes_when_nfkc_casefolded() => SINGLETON_PROPS_CWKCF_V1; + pub fn load_changes_when_nfkc_casefolded(); +} + +make_code_point_set_property! 
{ + property: "Changes_When_Lowercased"; + marker: ChangesWhenLowercasedProperty; + keyed_data_marker: ChangesWhenLowercasedV1Marker; + func: + /// Characters whose normalized forms are not stable under a toLowercase mapping + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let changes_when_lowercased = sets::changes_when_lowercased(); + /// + /// assert!(changes_when_lowercased.contains('Ⴔ')); // U+10B4 GEORGIAN CAPITAL LETTER PHAR + /// assert!(!changes_when_lowercased.contains('ფ')); // U+10E4 GEORGIAN LETTER PHAR + /// ``` + + pub const fn changes_when_lowercased() => SINGLETON_PROPS_CWL_V1; + pub fn load_changes_when_lowercased(); +} + +make_code_point_set_property! { + property: "Changes_When_Titlecased"; + marker: ChangesWhenTitlecasedProperty; + keyed_data_marker: ChangesWhenTitlecasedV1Marker; + func: + /// Characters whose normalized forms are not stable under a toTitlecase mapping + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let changes_when_titlecased = sets::changes_when_titlecased(); + /// + /// assert!(changes_when_titlecased.contains('æ')); // U+00E6 LATIN SMALL LETTER AE + /// assert!(!changes_when_titlecased.contains('Æ')); // U+00C6 LATIN CAPITAL LETTER AE + /// ``` + + pub const fn changes_when_titlecased() => SINGLETON_PROPS_CWT_V1; + pub fn load_changes_when_titlecased(); +} + +make_code_point_set_property! 
{ + property: "Changes_When_Uppercased"; + marker: ChangesWhenUppercasedProperty; + keyed_data_marker: ChangesWhenUppercasedV1Marker; + func: + /// Characters whose normalized forms are not stable under a toUppercase mapping + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let changes_when_uppercased = sets::changes_when_uppercased(); + /// + /// assert!(changes_when_uppercased.contains('ւ')); // U+0582 ARMENIAN SMALL LETTER YIWN + /// assert!(!changes_when_uppercased.contains('Ւ')); // U+0552 ARMENIAN CAPITAL LETTER YIWN + /// ``` + + pub const fn changes_when_uppercased() => SINGLETON_PROPS_CWU_V1; + pub fn load_changes_when_uppercased(); +} + +make_code_point_set_property! { + property: "Dash"; + marker: DashProperty; + keyed_data_marker: DashV1Marker; + func: + /// Punctuation characters explicitly called out as dashes in the Unicode Standard, plus + /// their compatibility equivalents + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let dash = sets::dash(); + /// + /// assert!(dash.contains('⸺')); // U+2E3A TWO-EM DASH + /// assert!(dash.contains('-')); // U+002D + /// assert!(!dash.contains('=')); // U+003D + /// ``` + + pub const fn dash() => SINGLETON_PROPS_DASH_V1; + pub fn load_dash(); +} + +make_code_point_set_property! { + property: "Deprecated"; + marker: DeprecatedProperty; + keyed_data_marker: DeprecatedV1Marker; + func: + /// Deprecated characters. No characters will ever be removed from the standard, but the + /// usage of deprecated characters is strongly discouraged. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let deprecated = sets::deprecated(); + /// + /// assert!(deprecated.contains('ឣ')); // U+17A3 KHMER INDEPENDENT VOWEL QAQ + /// assert!(!deprecated.contains('A')); + /// ``` + + pub const fn deprecated() => SINGLETON_PROPS_DEP_V1; + pub fn load_deprecated(); +} + +make_code_point_set_property! { + property: "Default_Ignorable_Code_Point"; + marker: DefaultIgnorableCodePointProperty; + keyed_data_marker: DefaultIgnorableCodePointV1Marker; + func: + /// For programmatic determination of default ignorable code points. New characters that + /// should be ignored in rendering (unless explicitly supported) will be assigned in these + /// ranges, permitting programs to correctly handle the default rendering of such + /// characters when not otherwise supported. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let default_ignorable_code_point = sets::default_ignorable_code_point(); + /// + /// assert!(default_ignorable_code_point.contains32(0x180B)); // MONGOLIAN FREE VARIATION SELECTOR ONE + /// assert!(!default_ignorable_code_point.contains('E')); + /// ``` + + pub const fn default_ignorable_code_point() => SINGLETON_PROPS_DI_V1; + pub fn load_default_ignorable_code_point(); +} + +make_code_point_set_property! 
{ + property: "Diacritic"; + marker: DiacriticProperty; + keyed_data_marker: DiacriticV1Marker; + func: + /// Characters that linguistically modify the meaning of another character to which they apply + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let diacritic = sets::diacritic(); + /// + /// assert!(diacritic.contains('\u{05B3}')); // HEBREW POINT HATAF QAMATS + /// assert!(!diacritic.contains('א')); // U+05D0 HEBREW LETTER ALEF + /// ``` + + pub const fn diacritic() => SINGLETON_PROPS_DIA_V1; + pub fn load_diacritic(); +} + +make_code_point_set_property! { + property: "Emoji_Modifier_Base"; + marker: EmojiModifierBaseProperty; + keyed_data_marker: EmojiModifierBaseV1Marker; + func: + /// Characters that can serve as a base for emoji modifiers + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let emoji_modifier_base = sets::emoji_modifier_base(); + /// + /// assert!(emoji_modifier_base.contains('✊')); // U+270A RAISED FIST + /// assert!(!emoji_modifier_base.contains('⛰')); // U+26F0 MOUNTAIN + /// ``` + + pub const fn emoji_modifier_base() => SINGLETON_PROPS_EBASE_V1; + pub fn load_emoji_modifier_base(); +} + +make_code_point_set_property! 
{ + property: "Emoji_Component"; + marker: EmojiComponentProperty; + keyed_data_marker: EmojiComponentV1Marker; + func: + /// Characters used in emoji sequences that normally do not appear on emoji keyboards as + /// separate choices, such as base characters for emoji keycaps + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let emoji_component = sets::emoji_component(); + /// + /// assert!(emoji_component.contains('🇹')); // U+1F1F9 REGIONAL INDICATOR SYMBOL LETTER T + /// assert!(emoji_component.contains32(0x20E3)); // COMBINING ENCLOSING KEYCAP + /// assert!(emoji_component.contains('7')); + /// assert!(!emoji_component.contains('T')); + /// ``` + + pub const fn emoji_component() => SINGLETON_PROPS_ECOMP_V1; + pub fn load_emoji_component(); +} + +make_code_point_set_property! { + property: "Emoji_Modifier"; + marker: EmojiModifierProperty; + keyed_data_marker: EmojiModifierV1Marker; + func: + /// Characters that are emoji modifiers + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let emoji_modifier = sets::emoji_modifier(); + /// + /// assert!(emoji_modifier.contains32(0x1F3FD)); // EMOJI MODIFIER FITZPATRICK TYPE-4 + /// assert!(!emoji_modifier.contains32(0x200C)); // ZERO WIDTH NON-JOINER + /// ``` + + pub const fn emoji_modifier() => SINGLETON_PROPS_EMOD_V1; + pub fn load_emoji_modifier(); +} + +make_code_point_set_property! 
{ + property: "Emoji"; + marker: EmojiProperty; + keyed_data_marker: EmojiV1Marker; + func: + /// Characters that are emoji + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let emoji = sets::emoji(); + /// + /// assert!(emoji.contains('🔥')); // U+1F525 FIRE + /// assert!(!emoji.contains('V')); + /// ``` + + pub const fn emoji() => SINGLETON_PROPS_EMOJI_V1; + pub fn load_emoji(); +} + +make_code_point_set_property! { + property: "Emoji_Presentation"; + marker: EmojiPresentationProperty; + keyed_data_marker: EmojiPresentationV1Marker; + func: + /// Characters that have emoji presentation by default + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let emoji_presentation = sets::emoji_presentation(); + /// + /// assert!(emoji_presentation.contains('🦬')); // U+1F9AC BISON + /// assert!(!emoji_presentation.contains('♻')); // U+267B BLACK UNIVERSAL RECYCLING SYMBOL + /// ``` + + pub const fn emoji_presentation() => SINGLETON_PROPS_EPRES_V1; + pub fn load_emoji_presentation(); +} + +make_code_point_set_property! { + property: "Extender"; + marker: ExtenderProperty; + keyed_data_marker: ExtenderV1Marker; + func: + /// Characters whose principal function is to extend the value of a preceding alphabetic + /// character or to extend the shape of adjacent characters. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let extender = sets::extender(); + /// + /// assert!(extender.contains('ヾ')); // U+30FE KATAKANA VOICED ITERATION MARK + /// assert!(extender.contains('ー')); // U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK + /// assert!(!extender.contains('・')); // U+30FB KATAKANA MIDDLE DOT + /// ``` + + pub const fn extender() => SINGLETON_PROPS_EXT_V1; + pub fn load_extender(); +} + +make_code_point_set_property! { + property: "Extended_Pictographic"; + marker: ExtendedPictographicProperty; + keyed_data_marker: ExtendedPictographicV1Marker; + func: + /// Pictographic symbols, as well as reserved ranges in blocks largely associated with + /// emoji characters + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let extended_pictographic = sets::extended_pictographic(); + /// + /// assert!(extended_pictographic.contains('🥳')); // U+1F973 FACE WITH PARTY HORN AND PARTY HAT + /// assert!(!extended_pictographic.contains('🇪')); // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E + /// ``` + + pub const fn extended_pictographic() => SINGLETON_PROPS_EXTPICT_V1; + pub fn load_extended_pictographic(); +} + +make_code_point_set_property! { + property: "Graph"; + marker: GraphProperty; + keyed_data_marker: GraphV1Marker; + func: + /// Visible characters. + /// This is defined for POSIX compatibility. + + pub const fn graph() => SINGLETON_PROPS_GRAPH_V1; + pub fn load_graph(); +} + +make_code_point_set_property! 
{ + property: "Grapheme_Base"; + marker: GraphemeBaseProperty; + keyed_data_marker: GraphemeBaseV1Marker; + func: + /// Property used together with the definition of Standard Korean Syllable Block to define + /// "Grapheme base". See D58 in Chapter 3, Conformance in the Unicode Standard. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let grapheme_base = sets::grapheme_base(); + /// + /// assert!(grapheme_base.contains('ക')); // U+0D15 MALAYALAM LETTER KA + /// assert!(grapheme_base.contains('\u{0D3F}')); // U+0D3F MALAYALAM VOWEL SIGN I + /// assert!(!grapheme_base.contains('\u{0D3E}')); // U+0D3E MALAYALAM VOWEL SIGN AA + /// ``` + + pub const fn grapheme_base() => SINGLETON_PROPS_GR_BASE_V1; + pub fn load_grapheme_base(); +} + +make_code_point_set_property! { + property: "Grapheme_Extend"; + marker: GraphemeExtendProperty; + keyed_data_marker: GraphemeExtendV1Marker; + func: + /// Property used to define "Grapheme extender". See D59 in Chapter 3, Conformance in the + /// Unicode Standard. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let grapheme_extend = sets::grapheme_extend(); + /// + /// assert!(!grapheme_extend.contains('ക')); // U+0D15 MALAYALAM LETTER KA + /// assert!(!grapheme_extend.contains('\u{0D3F}')); // U+0D3F MALAYALAM VOWEL SIGN I + /// assert!(grapheme_extend.contains('\u{0D3E}')); // U+0D3E MALAYALAM VOWEL SIGN AA + /// ``` + + pub const fn grapheme_extend() => SINGLETON_PROPS_GR_EXT_V1; + pub fn load_grapheme_extend(); +} + +make_code_point_set_property! { + property: "Grapheme_Link"; + marker: GraphemeLinkProperty; + keyed_data_marker: GraphemeLinkV1Marker; + func: + /// Deprecated property. 
Formerly proposed for programmatic determination of grapheme + /// cluster boundaries. + + pub const fn grapheme_link() => SINGLETON_PROPS_GR_LINK_V1; + pub fn load_grapheme_link(); +} + +make_code_point_set_property! { + property: "Hex_Digit"; + marker: HexDigitProperty; + keyed_data_marker: HexDigitV1Marker; + func: + /// Characters commonly used for the representation of hexadecimal numbers, plus their + /// compatibility equivalents + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let hex_digit = sets::hex_digit(); + /// + /// assert!(hex_digit.contains('0')); + /// assert!(!hex_digit.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE + /// assert!(hex_digit.contains('f')); + /// assert!(hex_digit.contains('\u{FF46}')); // U+FF46 FULLWIDTH LATIN SMALL LETTER F + /// assert!(hex_digit.contains('\u{FF26}')); // U+FF26 FULLWIDTH LATIN CAPITAL LETTER F + /// assert!(!hex_digit.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + /// ``` + + pub const fn hex_digit() => SINGLETON_PROPS_HEX_V1; + pub fn load_hex_digit(); +} + +make_code_point_set_property! { + property: "Hyphen"; + marker: HyphenProperty; + keyed_data_marker: HyphenV1Marker; + func: + /// Deprecated property. Dashes which are used to mark connections between pieces of + /// words, plus the Katakana middle dot. + + pub const fn hyphen() => SINGLETON_PROPS_HYPHEN_V1; + pub fn load_hyphen(); +} + +make_code_point_set_property! { + property: "Id_Continue"; + marker: IdContinueProperty; + keyed_data_marker: IdContinueV1Marker; + func: + /// Characters that can come after the first character in an identifier. If using NFKC to + /// fold differences between characters, use [`load_xid_continue`] instead. See + /// [`Unicode Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for + /// more details. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let id_continue = sets::id_continue(); + /// + /// assert!(id_continue.contains('x')); + /// assert!(id_continue.contains('1')); + /// assert!(id_continue.contains('_')); + /// assert!(id_continue.contains('ߝ')); // U+07DD NKO LETTER FA + /// assert!(!id_continue.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X + /// assert!(id_continue.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + /// ``` + + pub const fn id_continue() => SINGLETON_PROPS_IDC_V1; + pub fn load_id_continue(); +} + +make_code_point_set_property! { + property: "Ideographic"; + marker: IdeographicProperty; + keyed_data_marker: IdeographicV1Marker; + func: + /// Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) + /// ideographs, or related siniform ideographs + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let ideographic = sets::ideographic(); + /// + /// assert!(ideographic.contains('川')); // U+5DDD CJK UNIFIED IDEOGRAPH-5DDD + /// assert!(!ideographic.contains('밥')); // U+BC25 HANGUL SYLLABLE BAB + /// ``` + + pub const fn ideographic() => SINGLETON_PROPS_IDEO_V1; + pub fn load_ideographic(); +} + +make_code_point_set_property! { + property: "Id_Start"; + marker: IdStartProperty; + keyed_data_marker: IdStartV1Marker; + func: + /// Characters that can begin an identifier. If using NFKC to fold differences between + /// characters, use [`load_xid_start`] instead. See [`Unicode Standard Annex + /// #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more details. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let id_start = sets::id_start(); + /// + /// assert!(id_start.contains('x')); + /// assert!(!id_start.contains('1')); + /// assert!(!id_start.contains('_')); + /// assert!(id_start.contains('ߝ')); // U+07DD NKO LETTER FA + /// assert!(!id_start.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X + /// assert!(id_start.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + /// ``` + + pub const fn id_start() => SINGLETON_PROPS_IDS_V1; + pub fn load_id_start(); +} + +make_code_point_set_property! { + property: "Ids_Binary_Operator"; + marker: IdsBinaryOperatorProperty; + keyed_data_marker: IdsBinaryOperatorV1Marker; + func: + /// Characters used in Ideographic Description Sequences + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let ids_binary_operator = sets::ids_binary_operator(); + /// + /// assert!(ids_binary_operator.contains32(0x2FF5)); // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE + /// assert!(!ids_binary_operator.contains32(0x3006)); // IDEOGRAPHIC CLOSING MARK + /// ``` + + pub const fn ids_binary_operator() => SINGLETON_PROPS_IDSB_V1; + pub fn load_ids_binary_operator(); +} + +make_code_point_set_property! 
{ + property: "Ids_Trinary_Operator"; + marker: IdsTrinaryOperatorProperty; + keyed_data_marker: IdsTrinaryOperatorV1Marker; + func: + /// Characters used in Ideographic Description Sequences + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let ids_trinary_operator = sets::ids_trinary_operator(); + /// + /// assert!(ids_trinary_operator.contains32(0x2FF2)); // IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT + /// assert!(ids_trinary_operator.contains32(0x2FF3)); // IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW + /// assert!(!ids_trinary_operator.contains32(0x2FF4)); + /// assert!(!ids_trinary_operator.contains32(0x2FF5)); // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE + /// assert!(!ids_trinary_operator.contains32(0x3006)); // IDEOGRAPHIC CLOSING MARK + /// ``` + + pub const fn ids_trinary_operator() => SINGLETON_PROPS_IDST_V1; + pub fn load_ids_trinary_operator(); +} + +make_code_point_set_property! { + property: "Join_Control"; + marker: JoinControlProperty; + keyed_data_marker: JoinControlV1Marker; + func: + /// Format control characters which have specific functions for control of cursive joining + /// and ligation + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let join_control = sets::join_control(); + /// + /// assert!(join_control.contains32(0x200C)); // ZERO WIDTH NON-JOINER + /// assert!(join_control.contains32(0x200D)); // ZERO WIDTH JOINER + /// assert!(!join_control.contains32(0x200E)); + /// ``` + + pub const fn join_control() => SINGLETON_PROPS_JOIN_C_V1; + pub fn load_join_control(); +} + +make_code_point_set_property! 
{ + property: "Logical_Order_Exception"; + marker: LogicalOrderExceptionProperty; + keyed_data_marker: LogicalOrderExceptionV1Marker; + func: + /// A small number of spacing vowel letters occurring in certain Southeast Asian scripts such as Thai and Lao + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let logical_order_exception = sets::logical_order_exception(); + /// + /// assert!(logical_order_exception.contains('ແ')); // U+0EC1 LAO VOWEL SIGN EI + /// assert!(!logical_order_exception.contains('ະ')); // U+0EB0 LAO VOWEL SIGN A + /// ``` + + pub const fn logical_order_exception() => SINGLETON_PROPS_LOE_V1; + pub fn load_logical_order_exception(); +} + +make_code_point_set_property! { + property: "Lowercase"; + marker: LowercaseProperty; + keyed_data_marker: LowercaseV1Marker; + func: + /// Lowercase characters + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let lowercase = sets::lowercase(); + /// + /// assert!(lowercase.contains('a')); + /// assert!(!lowercase.contains('A')); + /// ``` + + pub const fn lowercase() => SINGLETON_PROPS_LOWER_V1; + pub fn load_lowercase(); +} + +make_code_point_set_property! 
{ + property: "Math"; + marker: MathProperty; + keyed_data_marker: MathV1Marker; + func: + /// Characters used in mathematical notation + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let math = sets::math(); + /// + /// assert!(math.contains('=')); + /// assert!(math.contains('+')); + /// assert!(!math.contains('-')); + /// assert!(math.contains('−')); // U+2212 MINUS SIGN + /// assert!(!math.contains('/')); + /// assert!(math.contains('∕')); // U+2215 DIVISION SLASH + /// ``` + + pub const fn math() => SINGLETON_PROPS_MATH_V1; + pub fn load_math(); +} + +make_code_point_set_property! { + property: "Noncharacter_Code_Point"; + marker: NoncharacterCodePointProperty; + keyed_data_marker: NoncharacterCodePointV1Marker; + func: + /// Code points permanently reserved for internal use + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let noncharacter_code_point = sets::noncharacter_code_point(); + /// + /// assert!(noncharacter_code_point.contains32(0xFDD0)); + /// assert!(noncharacter_code_point.contains32(0xFFFF)); + /// assert!(!noncharacter_code_point.contains32(0x10000)); + /// ``` + + pub const fn noncharacter_code_point() => SINGLETON_PROPS_NCHAR_V1; + pub fn load_noncharacter_code_point(); +} + +make_code_point_set_property! { + property: "NFC_Inert"; + marker: NfcInertProperty; + keyed_data_marker: NfcInertV1Marker; + func: + /// Characters that are inert under NFC, i.e., they do not interact with adjacent characters + + pub const fn nfc_inert() => SINGLETON_PROPS_NFCINERT_V1; + pub fn load_nfc_inert(); +} + +make_code_point_set_property! 
{ + property: "NFD_Inert"; + marker: NfdInertProperty; + keyed_data_marker: NfdInertV1Marker; + func: + /// Characters that are inert under NFD, i.e., they do not interact with adjacent characters + + pub const fn nfd_inert() => SINGLETON_PROPS_NFDINERT_V1; + pub fn load_nfd_inert(); +} + +make_code_point_set_property! { + property: "NFKC_Inert"; + marker: NfkcInertProperty; + keyed_data_marker: NfkcInertV1Marker; + func: + /// Characters that are inert under NFKC, i.e., they do not interact with adjacent characters + + pub const fn nfkc_inert() => SINGLETON_PROPS_NFKCINERT_V1; + pub fn load_nfkc_inert(); +} + +make_code_point_set_property! { + property: "NFKD_Inert"; + marker: NfkdInertProperty; + keyed_data_marker: NfkdInertV1Marker; + func: + /// Characters that are inert under NFKD, i.e., they do not interact with adjacent characters + + pub const fn nfkd_inert() => SINGLETON_PROPS_NFKDINERT_V1; + pub fn load_nfkd_inert(); +} + +make_code_point_set_property! { + property: "Pattern_Syntax"; + marker: PatternSyntaxProperty; + keyed_data_marker: PatternSyntaxV1Marker; + func: + /// Characters used as syntax in patterns (such as regular expressions). See [`Unicode + /// Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more + /// details. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let pattern_syntax = sets::pattern_syntax(); + /// + /// assert!(pattern_syntax.contains('{')); + /// assert!(pattern_syntax.contains('⇒')); // U+21D2 RIGHTWARDS DOUBLE ARROW + /// assert!(!pattern_syntax.contains('0')); + /// ``` + + pub const fn pattern_syntax() => SINGLETON_PROPS_PAT_SYN_V1; + pub fn load_pattern_syntax(); +} + +make_code_point_set_property! 
{ + property: "Pattern_White_Space"; + marker: PatternWhiteSpaceProperty; + keyed_data_marker: PatternWhiteSpaceV1Marker; + func: + /// Characters used as whitespace in patterns (such as regular expressions). See + /// [`Unicode Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for + /// more details. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let pattern_white_space = sets::pattern_white_space(); + /// + /// assert!(pattern_white_space.contains(' ')); + /// assert!(pattern_white_space.contains32(0x2029)); // PARAGRAPH SEPARATOR + /// assert!(pattern_white_space.contains32(0x000A)); // NEW LINE + /// assert!(!pattern_white_space.contains32(0x00A0)); // NO-BREAK SPACE + /// ``` + + pub const fn pattern_white_space() => SINGLETON_PROPS_PAT_WS_V1; + pub fn load_pattern_white_space(); +} + +make_code_point_set_property! { + property: "Prepended_Concatenation_Mark"; + marker: PrependedConcatenationMarkProperty; + keyed_data_marker: PrependedConcatenationMarkV1Marker; + func: + /// A small class of visible format controls, which precede and then span a sequence of + /// other characters, usually digits. + + pub const fn prepended_concatenation_mark() => SINGLETON_PROPS_PCM_V1; + pub fn load_prepended_concatenation_mark(); +} + +make_code_point_set_property! { + property: "Print"; + marker: PrintProperty; + keyed_data_marker: PrintV1Marker; + func: + /// Printable characters (visible characters and whitespace). + /// This is defined for POSIX compatibility. + + pub const fn print() => SINGLETON_PROPS_PRINT_V1; + pub fn load_print(); +} + +make_code_point_set_property! { + property: "Quotation_Mark"; + marker: QuotationMarkProperty; + keyed_data_marker: QuotationMarkV1Marker; + func: + /// Punctuation characters that function as quotation marks. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let quotation_mark = sets::quotation_mark(); + /// + /// assert!(quotation_mark.contains('\'')); + /// assert!(quotation_mark.contains('„')); // U+201E DOUBLE LOW-9 QUOTATION MARK + /// assert!(!quotation_mark.contains('<')); + /// ``` + + pub const fn quotation_mark() => SINGLETON_PROPS_QMARK_V1; + pub fn load_quotation_mark(); +} + +make_code_point_set_property! { + property: "Radical"; + marker: RadicalProperty; + keyed_data_marker: RadicalV1Marker; + func: + /// Characters used in the definition of Ideographic Description Sequences + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let radical = sets::radical(); + /// + /// assert!(radical.contains('⺆')); // U+2E86 CJK RADICAL BOX + /// assert!(!radical.contains('丹')); // U+F95E CJK COMPATIBILITY IDEOGRAPH-F95E + /// ``` + + pub const fn radical() => SINGLETON_PROPS_RADICAL_V1; + pub fn load_radical(); +} + +make_code_point_set_property! 
{ + property: "Regional_Indicator"; + marker: RegionalIndicatorProperty; + keyed_data_marker: RegionalIndicatorV1Marker; + func: + /// Regional indicator characters, U+1F1E6..U+1F1FF + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let regional_indicator = sets::regional_indicator(); + /// + /// assert!(regional_indicator.contains('🇹')); // U+1F1F9 REGIONAL INDICATOR SYMBOL LETTER T + /// assert!(!regional_indicator.contains('Ⓣ')); // U+24C9 CIRCLED LATIN CAPITAL LETTER T + /// assert!(!regional_indicator.contains('T')); + /// ``` + + pub const fn regional_indicator() => SINGLETON_PROPS_RI_V1; + pub fn load_regional_indicator(); +} + +make_code_point_set_property! { + property: "Soft_Dotted"; + marker: SoftDottedProperty; + keyed_data_marker: SoftDottedV1Marker; + func: + /// Characters with a "soft dot", like i or j. An accent placed on these characters causes + /// the dot to disappear. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let soft_dotted = sets::soft_dotted(); + /// + /// assert!(soft_dotted.contains('і')); // U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I + /// assert!(!soft_dotted.contains('ı')); // U+0131 LATIN SMALL LETTER DOTLESS I + /// ``` + + pub const fn soft_dotted() => SINGLETON_PROPS_SD_V1; + pub fn load_soft_dotted(); +} + +make_code_point_set_property! 
{ + property: "Segment_Starter"; + marker: SegmentStarterProperty; + keyed_data_marker: SegmentStarterV1Marker; + func: + /// Characters that are starters in terms of Unicode normalization and combining character + /// sequences + + pub const fn segment_starter() => SINGLETON_PROPS_SEGSTART_V1; + pub fn load_segment_starter(); +} + +make_code_point_set_property! { + property: "Case_Sensitive"; + marker: CaseSensitiveProperty; + keyed_data_marker: CaseSensitiveV1Marker; + func: + /// Characters that are either the source of a case mapping or in the target of a case + /// mapping + + pub const fn case_sensitive() => SINGLETON_PROPS_SENSITIVE_V1; + pub fn load_case_sensitive(); +} + +make_code_point_set_property! { + property: "Sentence_Terminal"; + marker: SentenceTerminalProperty; + keyed_data_marker: SentenceTerminalV1Marker; + func: + /// Punctuation characters that generally mark the end of sentences + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let sentence_terminal = sets::sentence_terminal(); + /// + /// assert!(sentence_terminal.contains('.')); + /// assert!(sentence_terminal.contains('?')); + /// assert!(sentence_terminal.contains('᪨')); // U+1AA8 TAI THAM SIGN KAAN + /// assert!(!sentence_terminal.contains(',')); + /// assert!(!sentence_terminal.contains('¿')); // U+00BF INVERTED QUESTION MARK + /// ``` + + pub const fn sentence_terminal() => SINGLETON_PROPS_STERM_V1; + pub fn load_sentence_terminal(); +} + +make_code_point_set_property! 
{ + property: "Terminal_Punctuation"; + marker: TerminalPunctuationProperty; + keyed_data_marker: TerminalPunctuationV1Marker; + func: + /// Punctuation characters that generally mark the end of textual units + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let terminal_punctuation = sets::terminal_punctuation(); + /// + /// assert!(terminal_punctuation.contains('.')); + /// assert!(terminal_punctuation.contains('?')); + /// assert!(terminal_punctuation.contains('᪨')); // U+1AA8 TAI THAM SIGN KAAN + /// assert!(terminal_punctuation.contains(',')); + /// assert!(!terminal_punctuation.contains('¿')); // U+00BF INVERTED QUESTION MARK + /// ``` + + pub const fn terminal_punctuation() => SINGLETON_PROPS_TERM_V1; + pub fn load_terminal_punctuation(); +} + +make_code_point_set_property! { + property: "Unified_Ideograph"; + marker: UnifiedIdeographProperty; + keyed_data_marker: UnifiedIdeographV1Marker; + func: + /// A property which specifies the exact set of Unified CJK Ideographs in the standard + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let unified_ideograph = sets::unified_ideograph(); + /// + /// assert!(unified_ideograph.contains('川')); // U+5DDD CJK UNIFIED IDEOGRAPH-5DDD + /// assert!(unified_ideograph.contains('木')); // U+6728 CJK UNIFIED IDEOGRAPH-6728 + /// assert!(!unified_ideograph.contains('𛅸')); // U+1B178 NUSHU CHARACTER-1B178 + /// ``` + + pub const fn unified_ideograph() => SINGLETON_PROPS_UIDEO_V1; + pub fn load_unified_ideograph(); +} + +make_code_point_set_property! 
{ + property: "Uppercase"; + marker: UppercaseProperty; + keyed_data_marker: UppercaseV1Marker; + func: + /// Uppercase characters + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let uppercase = sets::uppercase(); + /// + /// assert!(uppercase.contains('U')); + /// assert!(!uppercase.contains('u')); + /// ``` + + pub const fn uppercase() => SINGLETON_PROPS_UPPER_V1; + pub fn load_uppercase(); +} + +make_code_point_set_property! { + property: "Variation_Selector"; + marker: VariationSelectorProperty; + keyed_data_marker: VariationSelectorV1Marker; + func: + /// Characters that are Variation Selectors. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let variation_selector = sets::variation_selector(); + /// + /// assert!(variation_selector.contains32(0x180D)); // MONGOLIAN FREE VARIATION SELECTOR THREE + /// assert!(!variation_selector.contains32(0x303E)); // IDEOGRAPHIC VARIATION INDICATOR + /// assert!(variation_selector.contains32(0xFE0F)); // VARIATION SELECTOR-16 + /// assert!(!variation_selector.contains32(0xFE10)); // PRESENTATION FORM FOR VERTICAL COMMA + /// assert!(variation_selector.contains32(0xE01EF)); // VARIATION SELECTOR-256 + /// ``` + + pub const fn variation_selector() => SINGLETON_PROPS_VS_V1; + pub fn load_variation_selector(); +} + +make_code_point_set_property! 
{ + property: "White_Space"; + marker: WhiteSpaceProperty; + keyed_data_marker: WhiteSpaceV1Marker; + func: + /// Spaces, separator characters and other control characters which should be treated by + /// programming languages as "white space" for the purpose of parsing elements + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let white_space = sets::white_space(); + /// + /// assert!(white_space.contains(' ')); + /// assert!(white_space.contains32(0x000A)); // NEW LINE + /// assert!(white_space.contains32(0x00A0)); // NO-BREAK SPACE + /// assert!(!white_space.contains32(0x200B)); // ZERO WIDTH SPACE + /// ``` + + pub const fn white_space() => SINGLETON_PROPS_WSPACE_V1; + pub fn load_white_space(); +} + +make_code_point_set_property! { + property: "Xdigit"; + marker: XdigitProperty; + keyed_data_marker: XdigitV1Marker; + func: + /// Hexadecimal digits + /// This is defined for POSIX compatibility. + + pub const fn xdigit() => SINGLETON_PROPS_XDIGIT_V1; + pub fn load_xdigit(); +} + +make_code_point_set_property! { + property: "XID_Continue"; + marker: XidContinueProperty; + keyed_data_marker: XidContinueV1Marker; + func: + /// Characters that can come after the first character in an identifier. See [`Unicode Standard Annex + /// #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more details. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let xid_continue = sets::xid_continue(); + /// + /// assert!(xid_continue.contains('x')); + /// assert!(xid_continue.contains('1')); + /// assert!(xid_continue.contains('_')); + /// assert!(xid_continue.contains('ߝ')); // U+07DD NKO LETTER FA + /// assert!(!xid_continue.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X + /// assert!(!xid_continue.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + /// ``` + + pub const fn xid_continue() => SINGLETON_PROPS_XIDC_V1; + pub fn load_xid_continue(); +} + +make_code_point_set_property! { + property: "XID_Start"; + marker: XidStartProperty; + keyed_data_marker: XidStartV1Marker; + func: + /// Characters that can begin an identifier. See [`Unicode + /// Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more + /// details. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let xid_start = sets::xid_start(); + /// + /// assert!(xid_start.contains('x')); + /// assert!(!xid_start.contains('1')); + /// assert!(!xid_start.contains('_')); + /// assert!(xid_start.contains('ߝ')); // U+07DD NKO LETTER FA + /// assert!(!xid_start.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X + /// assert!(!xid_start.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + /// ``` + + pub const fn xid_start() => SINGLETON_PROPS_XIDS_V1; + pub fn load_xid_start(); +} + +// +// Binary property getter fns +// (data as sets of strings + code points) +// + +macro_rules! 
make_unicode_set_property { + ( + // currently unused + property: $property:expr; + // currently unused + marker: $marker_name:ident; + keyed_data_marker: $keyed_data_marker:ty; + func: + $(#[$doc:meta])+ + $cvis:vis const fn $constname:ident() => $singleton:ident; + $vis:vis fn $funcname:ident(); + ) => { + #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")] + $vis fn $funcname( + provider: &(impl DataProvider<$keyed_data_marker> + ?Sized) + ) -> Result<UnicodeSetData, PropertiesError> { + Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map(UnicodeSetData::from_data)?) + } + $(#[$doc])* + #[cfg(feature = "compiled_data")] + $cvis const fn $constname() -> UnicodeSetDataBorrowed<'static> { + UnicodeSetDataBorrowed { + set: crate::provider::Baked::$singleton + } + } + } +} + +make_unicode_set_property! { + property: "Basic_Emoji"; + marker: BasicEmojiProperty; + keyed_data_marker: BasicEmojiV1Marker; + func: + /// Characters and character sequences intended for general-purpose, independent, direct input. + /// See [`Unicode Technical Standard #51`](https://unicode.org/reports/tr51/) for more + /// details. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + /// + /// # Example + /// + /// ``` + /// use icu_properties::sets; + /// + /// let basic_emoji = sets::basic_emoji(); + /// + /// assert!(!basic_emoji.contains32(0x0020)); + /// assert!(!basic_emoji.contains_char('\n')); + /// assert!(basic_emoji.contains_char('🦃')); // U+1F983 TURKEY + /// assert!(basic_emoji.contains("\u{1F983}")); + /// assert!(basic_emoji.contains("\u{1F6E4}\u{FE0F}")); // railway track + /// assert!(!basic_emoji.contains("\u{0033}\u{FE0F}\u{20E3}")); // Emoji_Keycap_Sequence, keycap 3 + /// ``` + pub const fn basic_emoji() => SINGLETON_PROPS_BASIC_EMOJI_V1; + pub fn load_basic_emoji(); +} + +// +// Enumerated property getter fns +// + +/// A version of [`for_general_category_group()`] that uses custom data provided by a [`DataProvider`]. +/// +/// [📚 Help choosing a constructor](icu_provider::constructors) +pub fn load_for_general_category_group( + provider: &(impl DataProvider<GeneralCategoryV1Marker> + ?Sized), + enum_val: GeneralCategoryGroup, +) -> Result<CodePointSetData, PropertiesError> { + let gc_map_payload = maps::load_general_category(provider)?; + let gc_map = gc_map_payload.as_borrowed(); + let matching_gc_ranges = gc_map + .iter_ranges() + .filter(|cpm_range| (1 << cpm_range.value as u32) & enum_val.0 != 0) + .map(|cpm_range| cpm_range.range); + let set = CodePointInversionList::from_iter(matching_gc_ranges); + Ok(CodePointSetData::from_code_point_inversion_list(set)) +} + +/// Return a [`CodePointSetData`] for a value or a grouping of values of the General_Category property. See [`GeneralCategoryGroup`]. 
+/// +/// ✨ *Enabled with the `compiled_data` Cargo feature.* +/// +/// [📚 Help choosing a constructor](icu_provider::constructors) +#[cfg(feature = "compiled_data")] +pub fn for_general_category_group(enum_val: GeneralCategoryGroup) -> CodePointSetData { + let matching_gc_ranges = maps::general_category() + .iter_ranges() + .filter(|cpm_range| (1 << cpm_range.value as u32) & enum_val.0 != 0) + .map(|cpm_range| cpm_range.range); + let set = CodePointInversionList::from_iter(matching_gc_ranges); + CodePointSetData::from_code_point_inversion_list(set) +} + +/// Returns a type capable of looking up values for a property specified as a string, as long as it is a +/// [binary property listed in ECMA-262][ecma], using strict matching on the names in the spec. +/// +/// This handles every property required by ECMA-262 `/u` regular expressions, except for: +/// +/// - `Script` and `General_Category`: handle these directly with [`maps::load_general_category()`] and +/// [`maps::load_script()`], +/// using property values parsed via [`GeneralCategory::get_name_to_enum_mapper()`] and [`Script::get_name_to_enum_mapper()`] +/// if necessary. +/// - `Script_Extensions`: handle this directly using APIs from [`crate::script`], like [`script::load_script_with_extensions_unstable()`] +/// - `General_Category` mask values: Handle this alongside `General_Category` using [`GeneralCategoryGroup`], +/// using property values parsed via [`GeneralCategoryGroup::get_name_to_enum_mapper()`] if necessary +/// - `Any`, `Assigned`, and `ASCII` pseudoproperties: Handle these using their equivalent sets: +/// - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]` +/// - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`). +/// - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]` +/// - `General_Category` property values can themselves be treated like properties using a shorthand in ECMA262, +/// simply create the corresponding `GeneralCategory` set. 
+/// +/// ✨ *Enabled with the `compiled_data` Cargo feature.* +/// +/// [📚 Help choosing a constructor](icu_provider::constructors) +/// +/// ``` +/// use icu::properties::sets; +/// +/// let emoji = sets::load_for_ecma262("Emoji").expect("loading data failed"); +/// +/// assert!(emoji.contains('🔥')); // U+1F525 FIRE +/// assert!(!emoji.contains('V')); +/// ``` +/// +/// [ecma]: https://tc39.es/ecma262/#table-binary-unicode-properties +#[cfg(feature = "compiled_data")] +pub fn load_for_ecma262(name: &str) -> Result<CodePointSetDataBorrowed<'static>, PropertiesError> { + use crate::runtime::UnicodeProperty; + + let prop = if let Some(prop) = UnicodeProperty::parse_ecma262_name(name) { + prop + } else { + return Err(PropertiesError::UnexpectedPropertyName); + }; + Ok(match prop { + UnicodeProperty::AsciiHexDigit => ascii_hex_digit(), + UnicodeProperty::Alphabetic => alphabetic(), + UnicodeProperty::BidiControl => bidi_control(), + UnicodeProperty::BidiMirrored => bidi_mirrored(), + UnicodeProperty::CaseIgnorable => case_ignorable(), + UnicodeProperty::Cased => cased(), + UnicodeProperty::ChangesWhenCasefolded => changes_when_casefolded(), + UnicodeProperty::ChangesWhenCasemapped => changes_when_casemapped(), + UnicodeProperty::ChangesWhenLowercased => changes_when_lowercased(), + UnicodeProperty::ChangesWhenNfkcCasefolded => changes_when_nfkc_casefolded(), + UnicodeProperty::ChangesWhenTitlecased => changes_when_titlecased(), + UnicodeProperty::ChangesWhenUppercased => changes_when_uppercased(), + UnicodeProperty::Dash => dash(), + UnicodeProperty::DefaultIgnorableCodePoint => default_ignorable_code_point(), + UnicodeProperty::Deprecated => deprecated(), + UnicodeProperty::Diacritic => diacritic(), + UnicodeProperty::Emoji => emoji(), + UnicodeProperty::EmojiComponent => emoji_component(), + UnicodeProperty::EmojiModifier => emoji_modifier(), + UnicodeProperty::EmojiModifierBase => emoji_modifier_base(), + UnicodeProperty::EmojiPresentation => emoji_presentation(), + 
UnicodeProperty::ExtendedPictographic => extended_pictographic(), + UnicodeProperty::Extender => extender(), + UnicodeProperty::GraphemeBase => grapheme_base(), + UnicodeProperty::GraphemeExtend => grapheme_extend(), + UnicodeProperty::HexDigit => hex_digit(), + UnicodeProperty::IdsBinaryOperator => ids_binary_operator(), + UnicodeProperty::IdsTrinaryOperator => ids_trinary_operator(), + UnicodeProperty::IdContinue => id_continue(), + UnicodeProperty::IdStart => id_start(), + UnicodeProperty::Ideographic => ideographic(), + UnicodeProperty::JoinControl => join_control(), + UnicodeProperty::LogicalOrderException => logical_order_exception(), + UnicodeProperty::Lowercase => lowercase(), + UnicodeProperty::Math => math(), + UnicodeProperty::NoncharacterCodePoint => noncharacter_code_point(), + UnicodeProperty::PatternSyntax => pattern_syntax(), + UnicodeProperty::PatternWhiteSpace => pattern_white_space(), + UnicodeProperty::QuotationMark => quotation_mark(), + UnicodeProperty::Radical => radical(), + UnicodeProperty::RegionalIndicator => regional_indicator(), + UnicodeProperty::SentenceTerminal => sentence_terminal(), + UnicodeProperty::SoftDotted => soft_dotted(), + UnicodeProperty::TerminalPunctuation => terminal_punctuation(), + UnicodeProperty::UnifiedIdeograph => unified_ideograph(), + UnicodeProperty::Uppercase => uppercase(), + UnicodeProperty::VariationSelector => variation_selector(), + UnicodeProperty::WhiteSpace => white_space(), + UnicodeProperty::XidContinue => xid_continue(), + UnicodeProperty::XidStart => xid_start(), + _ => return Err(PropertiesError::UnexpectedPropertyName), + }) +} + +icu_provider::gen_any_buffer_data_constructors!( + locale: skip, + name: &str, + result: Result<CodePointSetData, PropertiesError>, + #[cfg(skip)] + functions: [ + load_for_ecma262, + load_for_ecma262_with_any_provider, + load_for_ecma262_with_buffer_provider, + load_for_ecma262_unstable, + ] +); + +#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, 
load_for_ecma262)] +pub fn load_for_ecma262_unstable<P>( + provider: &P, + name: &str, +) -> Result<CodePointSetData, PropertiesError> +where + P: ?Sized + + DataProvider<AsciiHexDigitV1Marker> + + DataProvider<AlphabeticV1Marker> + + DataProvider<BidiControlV1Marker> + + DataProvider<BidiMirroredV1Marker> + + DataProvider<CaseIgnorableV1Marker> + + DataProvider<CasedV1Marker> + + DataProvider<ChangesWhenCasefoldedV1Marker> + + DataProvider<ChangesWhenCasemappedV1Marker> + + DataProvider<ChangesWhenLowercasedV1Marker> + + DataProvider<ChangesWhenNfkcCasefoldedV1Marker> + + DataProvider<ChangesWhenTitlecasedV1Marker> + + DataProvider<ChangesWhenUppercasedV1Marker> + + DataProvider<DashV1Marker> + + DataProvider<DefaultIgnorableCodePointV1Marker> + + DataProvider<DeprecatedV1Marker> + + DataProvider<DiacriticV1Marker> + + DataProvider<EmojiV1Marker> + + DataProvider<EmojiComponentV1Marker> + + DataProvider<EmojiModifierV1Marker> + + DataProvider<EmojiModifierBaseV1Marker> + + DataProvider<EmojiPresentationV1Marker> + + DataProvider<ExtendedPictographicV1Marker> + + DataProvider<ExtenderV1Marker> + + DataProvider<GraphemeBaseV1Marker> + + DataProvider<GraphemeExtendV1Marker> + + DataProvider<HexDigitV1Marker> + + DataProvider<IdsBinaryOperatorV1Marker> + + DataProvider<IdsTrinaryOperatorV1Marker> + + DataProvider<IdContinueV1Marker> + + DataProvider<IdStartV1Marker> + + DataProvider<IdeographicV1Marker> + + DataProvider<JoinControlV1Marker> + + DataProvider<LogicalOrderExceptionV1Marker> + + DataProvider<LowercaseV1Marker> + + DataProvider<MathV1Marker> + + DataProvider<NoncharacterCodePointV1Marker> + + DataProvider<PatternSyntaxV1Marker> + + DataProvider<PatternWhiteSpaceV1Marker> + + DataProvider<QuotationMarkV1Marker> + + DataProvider<RadicalV1Marker> + + DataProvider<RegionalIndicatorV1Marker> + + DataProvider<SentenceTerminalV1Marker> + + DataProvider<SoftDottedV1Marker> + + DataProvider<TerminalPunctuationV1Marker> + + DataProvider<UnifiedIdeographV1Marker> + + 
DataProvider<UppercaseV1Marker> + + DataProvider<VariationSelectorV1Marker> + + DataProvider<WhiteSpaceV1Marker> + + DataProvider<XidContinueV1Marker> + + DataProvider<XidStartV1Marker>, +{ + use crate::runtime::UnicodeProperty; + + let prop = if let Some(prop) = UnicodeProperty::parse_ecma262_name(name) { + prop + } else { + return Err(PropertiesError::UnexpectedPropertyName); + }; + match prop { + UnicodeProperty::AsciiHexDigit => load_ascii_hex_digit(provider), + UnicodeProperty::Alphabetic => load_alphabetic(provider), + UnicodeProperty::BidiControl => load_bidi_control(provider), + UnicodeProperty::BidiMirrored => load_bidi_mirrored(provider), + UnicodeProperty::CaseIgnorable => load_case_ignorable(provider), + UnicodeProperty::Cased => load_cased(provider), + UnicodeProperty::ChangesWhenCasefolded => load_changes_when_casefolded(provider), + UnicodeProperty::ChangesWhenCasemapped => load_changes_when_casemapped(provider), + UnicodeProperty::ChangesWhenLowercased => load_changes_when_lowercased(provider), + UnicodeProperty::ChangesWhenNfkcCasefolded => load_changes_when_nfkc_casefolded(provider), + UnicodeProperty::ChangesWhenTitlecased => load_changes_when_titlecased(provider), + UnicodeProperty::ChangesWhenUppercased => load_changes_when_uppercased(provider), + UnicodeProperty::Dash => load_dash(provider), + UnicodeProperty::DefaultIgnorableCodePoint => load_default_ignorable_code_point(provider), + UnicodeProperty::Deprecated => load_deprecated(provider), + UnicodeProperty::Diacritic => load_diacritic(provider), + UnicodeProperty::Emoji => load_emoji(provider), + UnicodeProperty::EmojiComponent => load_emoji_component(provider), + UnicodeProperty::EmojiModifier => load_emoji_modifier(provider), + UnicodeProperty::EmojiModifierBase => load_emoji_modifier_base(provider), + UnicodeProperty::EmojiPresentation => load_emoji_presentation(provider), + UnicodeProperty::ExtendedPictographic => load_extended_pictographic(provider), + UnicodeProperty::Extender => 
load_extender(provider), + UnicodeProperty::GraphemeBase => load_grapheme_base(provider), + UnicodeProperty::GraphemeExtend => load_grapheme_extend(provider), + UnicodeProperty::HexDigit => load_hex_digit(provider), + UnicodeProperty::IdsBinaryOperator => load_ids_binary_operator(provider), + UnicodeProperty::IdsTrinaryOperator => load_ids_trinary_operator(provider), + UnicodeProperty::IdContinue => load_id_continue(provider), + UnicodeProperty::IdStart => load_id_start(provider), + UnicodeProperty::Ideographic => load_ideographic(provider), + UnicodeProperty::JoinControl => load_join_control(provider), + UnicodeProperty::LogicalOrderException => load_logical_order_exception(provider), + UnicodeProperty::Lowercase => load_lowercase(provider), + UnicodeProperty::Math => load_math(provider), + UnicodeProperty::NoncharacterCodePoint => load_noncharacter_code_point(provider), + UnicodeProperty::PatternSyntax => load_pattern_syntax(provider), + UnicodeProperty::PatternWhiteSpace => load_pattern_white_space(provider), + UnicodeProperty::QuotationMark => load_quotation_mark(provider), + UnicodeProperty::Radical => load_radical(provider), + UnicodeProperty::RegionalIndicator => load_regional_indicator(provider), + UnicodeProperty::SentenceTerminal => load_sentence_terminal(provider), + UnicodeProperty::SoftDotted => load_soft_dotted(provider), + UnicodeProperty::TerminalPunctuation => load_terminal_punctuation(provider), + UnicodeProperty::UnifiedIdeograph => load_unified_ideograph(provider), + UnicodeProperty::Uppercase => load_uppercase(provider), + UnicodeProperty::VariationSelector => load_variation_selector(provider), + UnicodeProperty::WhiteSpace => load_white_space(provider), + UnicodeProperty::XidContinue => load_xid_continue(provider), + UnicodeProperty::XidStart => load_xid_start(provider), + _ => Err(PropertiesError::UnexpectedPropertyName), + } +} + +#[cfg(test)] +mod tests { + + #[test] + fn test_general_category() { + use icu::properties::sets; + use 
icu::properties::GeneralCategoryGroup; + + let digits_data = sets::for_general_category_group(GeneralCategoryGroup::Number); + let digits = digits_data.as_borrowed(); + + assert!(digits.contains('5')); + assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE + assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE + + assert!(!digits.contains('A')); + } + + #[test] + fn test_script() { + use icu::properties::maps; + use icu::properties::Script; + + let thai_data = maps::script().get_set_for_value(Script::Thai); + let thai = thai_data.as_borrowed(); + + assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI + assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO + + assert!(!thai.contains('A')); + assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT + } + + #[test] + fn test_gc_groupings() { + use icu::properties::{maps, sets}; + use icu::properties::{GeneralCategory, GeneralCategoryGroup}; + use icu_collections::codepointinvlist::CodePointInversionListBuilder; + + let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| { + let category_set = sets::for_general_category_group(category); + let category_set = category_set + .as_code_point_inversion_list() + .expect("The data should be valid"); + + let mut builder = CodePointInversionListBuilder::new(); + for subcategory in subcategories { + let gc_set_data = &maps::general_category().get_set_for_value(*subcategory); + let gc_set = gc_set_data.as_borrowed(); + for range in gc_set.iter_ranges() { + builder.add_range_u32(&range); + } + } + let combined_set = builder.build(); + println!("{category:?} {subcategories:?}"); + assert_eq!( + category_set.get_inversion_list_vec(), + combined_set.get_inversion_list_vec() + ); + }; + + test_group( + GeneralCategoryGroup::Letter, + &[ + GeneralCategory::UppercaseLetter, + GeneralCategory::LowercaseLetter, + GeneralCategory::TitlecaseLetter, + GeneralCategory::ModifierLetter, +
GeneralCategory::OtherLetter, + ], + ); + test_group( + GeneralCategoryGroup::Other, + &[ + GeneralCategory::Control, + GeneralCategory::Format, + GeneralCategory::Unassigned, + GeneralCategory::PrivateUse, + GeneralCategory::Surrogate, + ], + ); + test_group( + GeneralCategoryGroup::Mark, + &[ + GeneralCategory::SpacingMark, + GeneralCategory::EnclosingMark, + GeneralCategory::NonspacingMark, + ], + ); + test_group( + GeneralCategoryGroup::Number, + &[ + GeneralCategory::DecimalNumber, + GeneralCategory::LetterNumber, + GeneralCategory::OtherNumber, + ], + ); + test_group( + GeneralCategoryGroup::Punctuation, + &[ + GeneralCategory::ConnectorPunctuation, + GeneralCategory::DashPunctuation, + GeneralCategory::ClosePunctuation, + GeneralCategory::FinalPunctuation, + GeneralCategory::InitialPunctuation, + GeneralCategory::OtherPunctuation, + GeneralCategory::OpenPunctuation, + ], + ); + test_group( + GeneralCategoryGroup::Symbol, + &[ + GeneralCategory::CurrencySymbol, + GeneralCategory::ModifierSymbol, + GeneralCategory::MathSymbol, + GeneralCategory::OtherSymbol, + ], + ); + test_group( + GeneralCategoryGroup::Separator, + &[ + GeneralCategory::LineSeparator, + GeneralCategory::ParagraphSeparator, + GeneralCategory::SpaceSeparator, + ], + ); + } + + #[test] + fn test_gc_surrogate() { + use icu::properties::maps; + use icu::properties::GeneralCategory; + + let surrogates_data = + maps::general_category().get_set_for_value(GeneralCategory::Surrogate); + let surrogates = surrogates_data.as_borrowed(); + + assert!(surrogates.contains32(0xd800)); + assert!(surrogates.contains32(0xd900)); + assert!(surrogates.contains32(0xdfff)); + + assert!(!surrogates.contains('A')); + } +} diff --git a/third_party/rust/icu_properties/src/trievalue.rs b/third_party/rust/icu_properties/src/trievalue.rs new file mode 100644 index 0000000000..d8b65e4aa9 --- /dev/null +++ b/third_party/rust/icu_properties/src/trievalue.rs @@ -0,0 +1,248 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::provider::bidi_data::{ + CheckedBidiPairedBracketType, MirroredPairedBracketData, MirroredPairedBracketDataTryFromError, +}; +use crate::script::ScriptWithExt; +use crate::{ + BidiClass, CanonicalCombiningClass, EastAsianWidth, GeneralCategory, GeneralCategoryGroup, + GraphemeClusterBreak, IndicSyllabicCategory, LineBreak, Script, SentenceBreak, WordBreak, +}; +use core::convert::TryInto; +use core::num::TryFromIntError; +use zerovec::ule::{AsULE, RawBytesULE}; + +use icu_collections::codepointtrie::TrieValue; + +use core::convert::TryFrom; + +impl TrieValue for CanonicalCombiningClass { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for BidiClass { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for GeneralCategory { + type TryFromU32Error = &'static str; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + // If the u32 is out of range, fall back to u8::MAX, which is out of range of the GeneralCategory enum. 
+ GeneralCategory::new_from_u8(i.try_into().unwrap_or(u8::MAX)) + .ok_or("Cannot parse GeneralCategory from integer") + } + + fn to_u32(self) -> u32 { + u32::from(self as u8) + } +} + +impl TrieValue for Script { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u16::try_from(i).map(Script) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for ScriptWithExt { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u16::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for EastAsianWidth { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for LineBreak { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for GraphemeClusterBreak { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for WordBreak { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for SentenceBreak { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +impl TrieValue for CheckedBidiPairedBracketType { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> 
{ + Ok(match i { + 1 => CheckedBidiPairedBracketType::Open, + 2 => CheckedBidiPairedBracketType::Close, + _ => CheckedBidiPairedBracketType::None, + }) + } +} + +impl TrieValue for IndicSyllabicCategory { + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + u8::try_from(i).map(Self) + } + + fn to_u32(self) -> u32 { + u32::from(self.0) + } +} + +// GCG is not used inside tries, but it is used in the name lookup type, and we want +// to squeeze it into a u16 for storage. Its named mask values are specced so we can +// do this in code. +// +// This is done by: +// - Single-value masks are translated to their corresponding GeneralCategory values +// - we know all of the multi-value masks and we give them special values +// - Anything else goes to 0xFF00, though this code path shouldn't be hit unless working with malformed icuexportdata +// +// In the reverse direction, unknown values go to the empty mask, but this codepath should not be hit except +// with malformed ICU4X generated data. 
+impl AsULE for GeneralCategoryGroup { + type ULE = RawBytesULE<2>; + fn to_unaligned(self) -> Self::ULE { + let value = gcg_to_packed_u16(self); + value.to_unaligned() + } + fn from_unaligned(ule: Self::ULE) -> Self { + let value = ule.as_unsigned_int(); + packed_u16_to_gcg(value) + } +} + +fn packed_u16_to_gcg(value: u16) -> GeneralCategoryGroup { + match value { + 0xFFFF => GeneralCategoryGroup::CasedLetter, + 0xFFFE => GeneralCategoryGroup::Letter, + 0xFFFD => GeneralCategoryGroup::Mark, + 0xFFFC => GeneralCategoryGroup::Number, + 0xFFFB => GeneralCategoryGroup::Separator, + 0xFFFA => GeneralCategoryGroup::Other, + 0xFFF9 => GeneralCategoryGroup::Punctuation, + 0xFFF8 => GeneralCategoryGroup::Symbol, + v if v < 32 => GeneralCategory::new_from_u8(v as u8) + .map(|gc| gc.into()) + .unwrap_or(GeneralCategoryGroup(0)), + // unknown values produce an empty mask + _ => GeneralCategoryGroup(0), + } +} + +fn gcg_to_packed_u16(gcg: GeneralCategoryGroup) -> u16 { + // if it's a single property, translate to that property + if gcg.0.count_ones() == 1 { + // inverse operation of a bitshift + gcg.0.trailing_zeros() as u16 + } else { + match gcg { + GeneralCategoryGroup::CasedLetter => 0xFFFF, + GeneralCategoryGroup::Letter => 0xFFFE, + GeneralCategoryGroup::Mark => 0xFFFD, + GeneralCategoryGroup::Number => 0xFFFC, + GeneralCategoryGroup::Separator => 0xFFFB, + GeneralCategoryGroup::Other => 0xFFFA, + GeneralCategoryGroup::Punctuation => 0xFFF9, + GeneralCategoryGroup::Symbol => 0xFFF8, + _ => 0xFF00, // random sentinel value + } + } +} + +impl TrieValue for GeneralCategoryGroup { + type TryFromU32Error = TryFromIntError; + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + // Even though we're dealing with u32s here, TrieValue is about converting + // trie storage types to the actual type. 
This type will always be a packed u16 + // in our case since the names map upcasts from u16 + u16::try_from(i).map(packed_u16_to_gcg) + } + + fn to_u32(self) -> u32 { + u32::from(gcg_to_packed_u16(self)) + } +} + +impl TrieValue for MirroredPairedBracketData { + type TryFromU32Error = MirroredPairedBracketDataTryFromError; + + fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> { + Self::try_from(i) + } +} |