summaryrefslogtreecommitdiffstats
path: root/third_party/rust/icu_properties/src
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
commit26a029d407be480d791972afb5975cf62c9360a6 (patch)
treef435a8308119effd964b339f76abb83a57c29483 /third_party/rust/icu_properties/src
parentInitial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/icu_properties/src')
-rw-r--r--third_party/rust/icu_properties/src/bidi.rs139
-rw-r--r--third_party/rust/icu_properties/src/bidi_data.rs216
-rw-r--r--third_party/rust/icu_properties/src/error.rs40
-rw-r--r--third_party/rust/icu_properties/src/exemplar_chars.rs247
-rw-r--r--third_party/rust/icu_properties/src/lib.rs115
-rw-r--r--third_party/rust/icu_properties/src/maps.rs602
-rw-r--r--third_party/rust/icu_properties/src/props.rs2365
-rw-r--r--third_party/rust/icu_properties/src/provider.rs900
-rw-r--r--third_party/rust/icu_properties/src/provider/bidi_data.rs289
-rw-r--r--third_party/rust/icu_properties/src/provider/names.rs277
-rw-r--r--third_party/rust/icu_properties/src/runtime.rs360
-rw-r--r--third_party/rust/icu_properties/src/script.rs648
-rw-r--r--third_party/rust/icu_properties/src/sets.rs2381
-rw-r--r--third_party/rust/icu_properties/src/trievalue.rs248
14 files changed, 8827 insertions, 0 deletions
diff --git a/third_party/rust/icu_properties/src/bidi.rs b/third_party/rust/icu_properties/src/bidi.rs
new file mode 100644
index 0000000000..ecbd6e74ed
--- /dev/null
+++ b/third_party/rust/icu_properties/src/bidi.rs
@@ -0,0 +1,139 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! This module exposes tooling for running the [Unicode Bidirectional Algorithm](https://unicode.org/reports/tr9/) using ICU4X data.
+//!
+//! `BidiClassAdapter` enables ICU4X to provide data to [`unicode-bidi`], an external crate implementing UAX #9.
+//!
+//! ✨ *Enabled with the `bidi` Cargo feature.*
+//!
+//! # Examples
+//!
+//! ```
+//! use icu_properties::bidi::BidiClassAdapter;
+//! use icu_properties::maps;
+//! use unicode_bidi::BidiInfo;
+//! // This example text is defined using `concat!` because some browsers
+//! // and text editors have trouble displaying bidi strings.
+//! let text = concat!["א", // RTL#1
+//! "ב", // RTL#2
+//! "ג", // RTL#3
+//! "a", // LTR#1
+//! "b", // LTR#2
+//! "c", // LTR#3
+//! ]; //
+//!
+//!
+//! let adapter = BidiClassAdapter::new(maps::bidi_class());
+//! // Resolve embedding levels within the text. Pass `None` to detect the
+//! // paragraph level automatically.
+//!
+//! let bidi_info = BidiInfo::new_with_data_source(&adapter, text, None);
+//!
+//! // This paragraph has embedding level 1 because its first strong character is RTL.
+//! assert_eq!(bidi_info.paragraphs.len(), 1);
+//! let para = &bidi_info.paragraphs[0];
+//! assert_eq!(para.level.number(), 1);
+//! assert!(para.level.is_rtl());
+//!
+//! // Re-ordering is done after wrapping each paragraph into a sequence of
+//! // lines. For this example, I'll just use a single line that spans the
+//! // entire paragraph.
+//! let line = para.range.clone();
+//!
+//! let display = bidi_info.reorder_line(para, line);
+//! assert_eq!(display, concat!["a", // LTR#1
+//! "b", // LTR#2
+//! "c", // LTR#3
+//! "ג", // RTL#3
+//! "ב", // RTL#2
+//! "א", // RTL#1
+//! ]);
+//! ```
+
+use crate::maps::CodePointMapDataBorrowed;
+use crate::props::BidiClass;
+use unicode_bidi::data_source::BidiDataSource;
+use unicode_bidi::BidiClass as DataSourceBidiClass;
+
+/// An adapter to convert from icu4x `BidiClass` to `unicode_bidi::BidiClass`.
+///
+/// ✨ *Enabled with the `bidi` Cargo feature.*
+///
+/// # Example
+///
+/// ```
+/// use icu_collections::codepointtrie::CodePointTrie;
+/// use icu_properties::bidi::BidiClassAdapter;
+/// use icu_properties::{maps, BidiClass};
+/// use unicode_bidi::BidiClass as DataSourceBidiClass;
+/// use unicode_bidi::BidiDataSource;
+///
+/// let adapter = BidiClassAdapter::new(maps::bidi_class());
+/// assert_eq!(adapter.bidi_class('a'), DataSourceBidiClass::L);
+/// assert_eq!(adapter.bidi_class('ع'), DataSourceBidiClass::AL);
+/// ```
+#[derive(Debug)]
+pub struct BidiClassAdapter<'a> {
+ data: CodePointMapDataBorrowed<'a, BidiClass>,
+}
+
+impl<'a> BidiClassAdapter<'a> {
+ /// Creates new instance of `BidiClassAdapter`.
+ pub fn new(data: CodePointMapDataBorrowed<'a, BidiClass>) -> BidiClassAdapter<'a> {
+ BidiClassAdapter { data }
+ }
+}
+
+impl<'a> BidiDataSource for BidiClassAdapter<'a> {
+ /// Returns a [`DataSourceBidiClass`] given a unicode character.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_collections::codepointtrie::CodePointTrie;
+ /// use icu_properties::bidi::BidiClassAdapter;
+ /// use icu_properties::{maps, BidiClass};
+ /// use unicode_bidi::BidiClass as DataSourceBidiClass;
+ /// use unicode_bidi::BidiDataSource;
+ ///
+ /// let adapter = BidiClassAdapter::new(maps::bidi_class());
+ /// assert_eq!(adapter.bidi_class('a'), DataSourceBidiClass::L);
+ /// ```
+ ///
+ /// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
+ fn bidi_class(&self, c: char) -> DataSourceBidiClass {
+ let bidi_class = self.data.get(c);
+ match bidi_class {
+ BidiClass::LeftToRight => DataSourceBidiClass::L,
+ BidiClass::RightToLeft => DataSourceBidiClass::R,
+ BidiClass::EuropeanNumber => DataSourceBidiClass::EN,
+ BidiClass::EuropeanSeparator => DataSourceBidiClass::ES,
+ BidiClass::EuropeanTerminator => DataSourceBidiClass::ET,
+ BidiClass::ArabicNumber => DataSourceBidiClass::AN,
+ BidiClass::CommonSeparator => DataSourceBidiClass::CS,
+ BidiClass::ParagraphSeparator => DataSourceBidiClass::B,
+ BidiClass::SegmentSeparator => DataSourceBidiClass::S,
+ BidiClass::WhiteSpace => DataSourceBidiClass::WS,
+ BidiClass::OtherNeutral => DataSourceBidiClass::ON,
+ BidiClass::LeftToRightEmbedding => DataSourceBidiClass::LRE,
+ BidiClass::LeftToRightOverride => DataSourceBidiClass::LRO,
+ BidiClass::ArabicLetter => DataSourceBidiClass::AL,
+ BidiClass::RightToLeftEmbedding => DataSourceBidiClass::RLE,
+ BidiClass::RightToLeftOverride => DataSourceBidiClass::RLO,
+ BidiClass::PopDirectionalFormat => DataSourceBidiClass::PDF,
+ BidiClass::NonspacingMark => DataSourceBidiClass::NSM,
+ BidiClass::BoundaryNeutral => DataSourceBidiClass::BN,
+ BidiClass::FirstStrongIsolate => DataSourceBidiClass::FSI,
+ BidiClass::LeftToRightIsolate => DataSourceBidiClass::LRI,
+ BidiClass::RightToLeftIsolate => DataSourceBidiClass::RLI,
+ BidiClass::PopDirectionalIsolate => DataSourceBidiClass::PDI,
+ _ =>
+ // This must not happen.
+ {
+ DataSourceBidiClass::ON
+ }
+ }
+ }
+}
diff --git a/third_party/rust/icu_properties/src/bidi_data.rs b/third_party/rust/icu_properties/src/bidi_data.rs
new file mode 100644
index 0000000000..2356cda023
--- /dev/null
+++ b/third_party/rust/icu_properties/src/bidi_data.rs
@@ -0,0 +1,216 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! Data and APIs for supporting specific Bidi properties data in an efficient structure.
+//!
+//! Supported properties are:
+//! - `Bidi_Paired_Bracket`
+//! - `Bidi_Paired_Bracket_Type`
+//! - `Bidi_Mirrored`
+//! - `Bidi_Mirroring_Glyph`
+
+use crate::provider::bidi_data::{
+ BidiAuxiliaryPropertiesV1, BidiAuxiliaryPropertiesV1Marker, CheckedBidiPairedBracketType,
+};
+use crate::PropertiesError;
+
+use icu_provider::prelude::*;
+
+/// A wrapper around certain Bidi properties data. Can be obtained via [`bidi_auxiliary_properties()`] and
+/// related getters.
+///
+/// Most useful methods are on [`BidiAuxiliaryPropertiesBorrowed`] obtained by calling [`BidiAuxiliaryProperties::as_borrowed()`]
+#[derive(Debug)]
+pub struct BidiAuxiliaryProperties {
+ data: DataPayload<BidiAuxiliaryPropertiesV1Marker>,
+}
+
+impl BidiAuxiliaryProperties {
+ /// Construct a borrowed version of this type that can be queried.
+ ///
+ /// This avoids a potential small underlying cost per API call by consolidating it
+ /// up front.
+ #[inline]
+ pub fn as_borrowed(&self) -> BidiAuxiliaryPropertiesBorrowed<'_> {
+ BidiAuxiliaryPropertiesBorrowed {
+ data: self.data.get(),
+ }
+ }
+
+ /// Construct a new one from loaded data
+ ///
+ /// Typically it is preferable to use getters like [`bidi_auxiliary_properties()`] instead
+ pub fn from_data(data: DataPayload<BidiAuxiliaryPropertiesV1Marker>) -> Self {
+ Self { data }
+ }
+}
+
+/// This struct represents the properties Bidi_Mirrored and Bidi_Mirroring_Glyph.
+/// If Bidi_Mirroring_Glyph is not defined for a code point, then the value in the
+/// struct is `None`.
+#[derive(Debug, Eq, PartialEq)]
+#[non_exhaustive]
+pub struct BidiMirroringProperties {
+ /// Represents the Bidi_Mirroring_Glyph property value
+ pub mirroring_glyph: Option<char>,
+ /// Represents the Bidi_Mirrored property value
+ pub mirrored: bool,
+}
+
+/// The enum represents Bidi_Paired_Bracket_Type, the char represents Bidi_Paired_Bracket.
+/// Bidi_Paired_Bracket has a value of `None` when Bidi_Paired_Bracket_Type is `None`.
+#[derive(Debug, Eq, PartialEq)]
+#[non_exhaustive]
+pub enum BidiPairingProperties {
+ /// Represents Bidi_Paired_Bracket_Type=Open, and the Bidi_Paired_Bracket value for that code point.
+ Open(char),
+ /// Represents Bidi_Paired_Bracket_Type=Close, and the Bidi_Paired_Bracket value for that code point.
+ Close(char),
+ /// Represents Bidi_Paired_Bracket_Type=None, which cooccurs with Bidi_Paired_Bracket
+ /// being undefined for that code point.
+ None,
+}
+
+/// A borrowed wrapper around Bidi properties data, returned by
+/// [`BidiAuxiliaryProperties::as_borrowed()`]. More efficient to query.
+#[derive(Debug)]
+pub struct BidiAuxiliaryPropertiesBorrowed<'a> {
+ data: &'a BidiAuxiliaryPropertiesV1<'a>,
+}
+
+impl<'a> BidiAuxiliaryPropertiesBorrowed<'a> {
+ // The source data coming from icuexportdata will use 0 to represent the
+ // property value in cases for which the Bidi_Mirroring_Glyph property value
+ // of a code point is undefined. Since Rust types can be more expressive, we
+ // should represent these cases as None.
+ fn convert_mirroring_glyph_data(trie_data_char: char) -> Option<char> {
+ if trie_data_char as u32 == 0 {
+ None
+ } else {
+ Some(trie_data_char)
+ }
+ }
+
+ /// Return a struct for the given code point representing Bidi mirroring-related
+ /// property values. See [`BidiMirroringProperties`].
+ ///
+ /// # Examples
+ /// ```
+ /// use icu_properties::{bidi_data, bidi_data::BidiMirroringProperties};
+ ///
+ /// let bidi_data = bidi_data::bidi_auxiliary_properties();
+ ///
+ /// let open_paren = bidi_data.get32_mirroring_props('(' as u32);
+ /// assert_eq!(open_paren.mirroring_glyph, Some(')'));
+ /// assert_eq!(open_paren.mirrored, true);
+ /// let close_paren = bidi_data.get32_mirroring_props(')' as u32);
+ /// assert_eq!(close_paren.mirroring_glyph, Some('('));
+ /// assert_eq!(close_paren.mirrored, true);
+ /// let open_angle_bracket = bidi_data.get32_mirroring_props('<' as u32);
+ /// assert_eq!(open_angle_bracket.mirroring_glyph, Some('>'));
+ /// assert_eq!(open_angle_bracket.mirrored, true);
+ /// let close_angle_bracket = bidi_data.get32_mirroring_props('>' as u32);
+ /// assert_eq!(close_angle_bracket.mirroring_glyph, Some('<'));
+ /// assert_eq!(close_angle_bracket.mirrored, true);
+ /// let three = bidi_data.get32_mirroring_props('3' as u32);
+ /// assert_eq!(three.mirroring_glyph, None);
+ /// assert_eq!(three.mirrored, false);
+ /// ```
+ pub fn get32_mirroring_props(&self, code_point: u32) -> BidiMirroringProperties {
+ let bidi_aux_props = self.data.trie.get32(code_point);
+ let mirroring_glyph_opt =
+ Self::convert_mirroring_glyph_data(bidi_aux_props.mirroring_glyph);
+ BidiMirroringProperties {
+ mirroring_glyph: mirroring_glyph_opt,
+ mirrored: bidi_aux_props.mirrored,
+ }
+ }
+
+ /// Return a struct for the given code point representing Bidi bracket
+ /// pairing-related property values. See [`BidiPairingProperties`]
+ ///
+ /// # Examples
+ /// ```
+ /// use icu_properties::{bidi_data, bidi_data::BidiPairingProperties};
+ ///
+ /// let bidi_data = bidi_data::bidi_auxiliary_properties();
+ ///
+ /// let open_paren = bidi_data.get32_pairing_props('(' as u32);
+ /// assert_eq!(open_paren, BidiPairingProperties::Open(')'));
+ /// let close_paren = bidi_data.get32_pairing_props(')' as u32);
+ /// assert_eq!(close_paren, BidiPairingProperties::Close('('));
+ /// let open_angle_bracket = bidi_data.get32_pairing_props('<' as u32);
+ /// assert_eq!(open_angle_bracket, BidiPairingProperties::None);
+ /// let close_angle_bracket = bidi_data.get32_pairing_props('>' as u32);
+ /// assert_eq!(close_angle_bracket, BidiPairingProperties::None);
+ /// let three = bidi_data.get32_pairing_props('3' as u32);
+ /// assert_eq!(three, BidiPairingProperties::None);
+ /// ```
+ pub fn get32_pairing_props(&self, code_point: u32) -> BidiPairingProperties {
+ let bidi_aux_props = self.data.trie.get32(code_point);
+ let mirroring_glyph = bidi_aux_props.mirroring_glyph;
+ let paired_bracket_type = bidi_aux_props.paired_bracket_type;
+ match paired_bracket_type {
+ CheckedBidiPairedBracketType::Open => BidiPairingProperties::Open(mirroring_glyph),
+ CheckedBidiPairedBracketType::Close => BidiPairingProperties::Close(mirroring_glyph),
+ _ => BidiPairingProperties::None,
+ }
+ }
+}
+
+impl BidiAuxiliaryPropertiesBorrowed<'static> {
+ /// Cheaply converts a `BidiAuxiliaryPropertiesBorrowed<'static>` into a `BidiAuxiliaryProperties`.
+ pub const fn static_to_owned(self) -> BidiAuxiliaryProperties {
+ BidiAuxiliaryProperties {
+ data: DataPayload::from_static_ref(self.data),
+ }
+ }
+}
+
+/// Creates a [`BidiAuxiliaryPropertiesBorrowed`] that provides access to the data for certain
+/// Bidi properties.
+///
+/// ✨ *Enabled with the `compiled_data` Cargo feature.*
+///
+/// [📚 Help choosing a constructor](icu_provider::constructors)
+///
+/// # Examples
+/// ```
+/// use icu_properties::{bidi_data, bidi_data::BidiMirroringProperties};
+///
+/// let bidi_data = bidi_data::bidi_auxiliary_properties();
+///
+/// let open_paren = bidi_data.get32_mirroring_props('(' as u32);
+/// assert_eq!(open_paren.mirroring_glyph, Some(')'));
+/// assert_eq!(open_paren.mirrored, true);
+/// ```
+#[cfg(feature = "compiled_data")]
+pub const fn bidi_auxiliary_properties() -> BidiAuxiliaryPropertiesBorrowed<'static> {
+ BidiAuxiliaryPropertiesBorrowed {
+ data: crate::provider::Baked::SINGLETON_PROPS_BIDIAUXILIARYPROPS_V1,
+ }
+}
+
+icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: skip,
+ result: Result<BidiAuxiliaryProperties, PropertiesError>,
+ #[cfg(skip)]
+ functions: [
+ bidi_auxiliary_properties,
+ load_bidi_auxiliary_properties_with_any_provider,
+ load_bidi_auxiliary_properties_with_buffer_provider,
+ load_bidi_auxiliary_properties_unstable,
+ ]
+);
+
+#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, bidi_auxiliary_properties)]
+pub fn load_bidi_auxiliary_properties_unstable(
+ provider: &(impl DataProvider<BidiAuxiliaryPropertiesV1Marker> + ?Sized),
+) -> Result<BidiAuxiliaryProperties, PropertiesError> {
+ Ok(provider
+ .load(Default::default())
+ .and_then(DataResponse::take_payload)
+ .map(BidiAuxiliaryProperties::from_data)?)
+}
diff --git a/third_party/rust/icu_properties/src/error.rs b/third_party/rust/icu_properties/src/error.rs
new file mode 100644
index 0000000000..1526e75790
--- /dev/null
+++ b/third_party/rust/icu_properties/src/error.rs
@@ -0,0 +1,40 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use displaydoc::Display;
+use icu_provider::DataError;
+
+#[cfg(doc)]
+use crate::GeneralCategoryGroup;
+#[cfg(doc)]
+use crate::Script;
+
+#[cfg(feature = "std")]
+impl std::error::Error for PropertiesError {}
+
+/// A list of error outcomes for various operations in this module.
+///
+/// Re-exported as [`Error`](crate::Error).
+#[derive(Display, Debug, Copy, Clone)]
+#[non_exhaustive]
+pub enum PropertiesError {
+ /// An error occurred while loading data
+ #[displaydoc("{0}")]
+ PropDataLoad(DataError),
+ /// An unknown value was used for the [`Script`](crate::Script) property
+ #[displaydoc("Unknown script id: {0}")]
+ UnknownScriptId(u16),
+ /// An unknown value was used for the [`GeneralCategoryGroup`](crate::GeneralCategoryGroup) property
+ #[displaydoc("Unknown general category group: {0}")]
+ UnknownGeneralCategoryGroup(u32),
+ /// An unknown or unexpected property name was used for an API dealing with properties specified as strings at runtime
+ #[displaydoc("Unexpected or unknown property name")]
+ UnexpectedPropertyName,
+}
+
+impl From<DataError> for PropertiesError {
+ fn from(e: DataError) -> Self {
+ PropertiesError::PropDataLoad(e)
+ }
+}
diff --git a/third_party/rust/icu_properties/src/exemplar_chars.rs b/third_party/rust/icu_properties/src/exemplar_chars.rs
new file mode 100644
index 0000000000..2dd7b343bf
--- /dev/null
+++ b/third_party/rust/icu_properties/src/exemplar_chars.rs
@@ -0,0 +1,247 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! This module provides APIs for getting exemplar characters for a locale.
+//!
+//! Exemplars are characters used by a language, separated into different sets.
+//! The sets are: main, auxiliary, punctuation, numbers, and index.
+//!
+//! The sets define, according to typical usage in the language,
+//! which characters occur in which contexts with which frequency.
+//! For more information, see the documentation in the
+//! [Exemplars section in Unicode Technical Standard #35](https://unicode.org/reports/tr35/tr35-general.html#Exemplars)
+//! of the LDML specification.
+//!
+//! # Examples
+//!
+//! ```
+//! use icu::locid::locale;
+//! use icu::properties::exemplar_chars;
+//!
+//! let locale = locale!("en-001").into();
+//! let data = exemplar_chars::exemplars_main(&locale)
+//! .expect("locale should be present");
+//! let exemplars_main = data.as_borrowed();
+//!
+//! assert!(exemplars_main.contains_char('a'));
+//! assert!(exemplars_main.contains_char('z'));
+//! assert!(exemplars_main.contains("a"));
+//! assert!(!exemplars_main.contains("ä"));
+//! assert!(!exemplars_main.contains("ng"));
+//! ```
+
+use crate::provider::*;
+use crate::sets::UnicodeSetData;
+use crate::PropertiesError;
+use icu_provider::prelude::*;
+
+macro_rules! make_exemplar_chars_unicode_set_property {
+ (
+ // currently unused
+ marker: $marker_name:ident;
+ keyed_data_marker: $keyed_data_marker:ty;
+ func:
+ $vis:vis fn $funcname:ident();
+ $(#[$attr:meta])*
+ $vis2:vis fn $constname:ident();
+ ) => {
+ #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")]
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ $vis fn $funcname(
+ provider: &(impl DataProvider<$keyed_data_marker> + ?Sized),
+ locale: &DataLocale,
+ ) -> Result<UnicodeSetData, PropertiesError> {
+ Ok(provider.load(
+ DataRequest {
+ locale,
+ metadata: Default::default(),
+ })
+ .and_then(DataResponse::take_payload)
+ .map(UnicodeSetData::from_data)?
+ )
+ }
+ $(#[$attr])*
+ #[cfg(feature = "compiled_data")]
+ $vis2 fn $constname(
+ locale: &DataLocale,
+ ) -> Result<UnicodeSetData, PropertiesError> {
+ Ok(UnicodeSetData::from_data(
+ DataProvider::<$keyed_data_marker>::load(
+ &crate::provider::Baked,
+ DataRequest {
+ locale,
+ metadata: Default::default(),
+ })
+ .and_then(DataResponse::take_payload)?
+ ))
+ }
+ }
+}
+
+make_exemplar_chars_unicode_set_property!(
+ marker: ExemplarCharactersMain;
+ keyed_data_marker: ExemplarCharactersMainV1Marker;
+ func:
+ pub fn load_exemplars_main();
+
+ /// Get the "main" set of exemplar characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::locid::locale;
+ /// use icu::properties::exemplar_chars;
+ ///
+ /// let data = exemplar_chars::exemplars_main(&locale!("en").into())
+ /// .expect("locale should be present");
+ /// let exemplars_main = data.as_borrowed();
+ ///
+ /// assert!(exemplars_main.contains_char('a'));
+ /// assert!(exemplars_main.contains_char('z'));
+ /// assert!(exemplars_main.contains("a"));
+ /// assert!(!exemplars_main.contains("ä"));
+ /// assert!(!exemplars_main.contains("ng"));
+ /// assert!(!exemplars_main.contains("A"));
+ /// ```
+ pub fn exemplars_main();
+);
+
+make_exemplar_chars_unicode_set_property!(
+ marker: ExemplarCharactersAuxiliary;
+ keyed_data_marker: ExemplarCharactersAuxiliaryV1Marker;
+ func:
+ pub fn load_exemplars_auxiliary();
+
+ /// Get the "auxiliary" set of exemplar characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::locid::locale;
+ /// use icu::properties::exemplar_chars;
+ ///
+ /// let data =
+ /// exemplar_chars::exemplars_auxiliary(&locale!("en").into())
+ /// .expect("locale should be present");
+ /// let exemplars_auxiliary = data.as_borrowed();
+ ///
+ /// assert!(!exemplars_auxiliary.contains_char('a'));
+ /// assert!(!exemplars_auxiliary.contains_char('z'));
+ /// assert!(!exemplars_auxiliary.contains("a"));
+ /// assert!(exemplars_auxiliary.contains("ä"));
+ /// assert!(!exemplars_auxiliary.contains("ng"));
+ /// assert!(!exemplars_auxiliary.contains("A"));
+ /// ```
+ pub fn exemplars_auxiliary();
+);
+
+make_exemplar_chars_unicode_set_property!(
+ marker: ExemplarCharactersPunctuation;
+ keyed_data_marker: ExemplarCharactersPunctuationV1Marker;
+ func:
+ pub fn load_exemplars_punctuation();
+
+ /// Get the "punctuation" set of exemplar characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::locid::locale;
+ /// use icu::properties::exemplar_chars;
+ ///
+ /// let data =
+ /// exemplar_chars::exemplars_punctuation(&locale!("en").into())
+ /// .expect("locale should be present");
+ /// let exemplars_punctuation = data.as_borrowed();
+ ///
+ /// assert!(!exemplars_punctuation.contains_char('0'));
+ /// assert!(!exemplars_punctuation.contains_char('9'));
+ /// assert!(!exemplars_punctuation.contains_char('%'));
+ /// assert!(exemplars_punctuation.contains_char(','));
+ /// assert!(exemplars_punctuation.contains_char('.'));
+ /// assert!(exemplars_punctuation.contains_char('!'));
+ /// assert!(exemplars_punctuation.contains_char('?'));
+ /// ```
+ pub fn exemplars_punctuation();
+);
+
+make_exemplar_chars_unicode_set_property!(
+ marker: ExemplarCharactersNumbers;
+ keyed_data_marker: ExemplarCharactersNumbersV1Marker;
+ func:
+ pub fn load_exemplars_numbers();
+
+ /// Get the "numbers" set of exemplar characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::locid::locale;
+ /// use icu::properties::exemplar_chars;
+ ///
+ /// let data =
+ /// exemplar_chars::exemplars_numbers(&locale!("en").into())
+ /// .expect("locale should be present");
+ /// let exemplars_numbers = data.as_borrowed();
+ ///
+ /// assert!(exemplars_numbers.contains_char('0'));
+ /// assert!(exemplars_numbers.contains_char('9'));
+ /// assert!(exemplars_numbers.contains_char('%'));
+ /// assert!(exemplars_numbers.contains_char(','));
+ /// assert!(exemplars_numbers.contains_char('.'));
+ /// assert!(!exemplars_numbers.contains_char('!'));
+ /// assert!(!exemplars_numbers.contains_char('?'));
+ /// ```
+ pub fn exemplars_numbers();
+);
+
+make_exemplar_chars_unicode_set_property!(
+ marker: ExemplarCharactersIndex;
+ keyed_data_marker: ExemplarCharactersIndexV1Marker;
+ func:
+ pub fn load_exemplars_index();
+
+ /// Get the "index" set of exemplar characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::locid::locale;
+ /// use icu::properties::exemplar_chars;
+ ///
+ /// let data =
+ /// exemplar_chars::exemplars_index(&locale!("en").into())
+ /// .expect("locale should be present");
+ /// let exemplars_index = data.as_borrowed();
+ ///
+ /// assert!(!exemplars_index.contains_char('a'));
+ /// assert!(!exemplars_index.contains_char('z'));
+ /// assert!(!exemplars_index.contains("a"));
+ /// assert!(!exemplars_index.contains("ä"));
+ /// assert!(!exemplars_index.contains("ng"));
+ /// assert!(exemplars_index.contains("A"));
+ /// ```
+ pub fn exemplars_index();
+);
diff --git a/third_party/rust/icu_properties/src/lib.rs b/third_party/rust/icu_properties/src/lib.rs
new file mode 100644
index 0000000000..3b9a236e23
--- /dev/null
+++ b/third_party/rust/icu_properties/src/lib.rs
@@ -0,0 +1,115 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! Definitions of [Unicode Properties] and APIs for
+//! retrieving property data in an appropriate data structure.
+//!
+//! This module is published as its own crate ([`icu_properties`](https://docs.rs/icu_properties/latest/icu_properties/))
+//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
+//!
+//! APIs that return a [`CodePointSetData`] exist for binary properties and certain enumerated
+//! properties. See the [`sets`] module for more details.
+//!
+//! APIs that return a [`CodePointMapData`] exist for certain enumerated properties. See the
+//! [`maps`] module for more details.
+//!
+//! # Examples
+//!
+//! ## Property data as `CodePointSetData`s
+//!
+//! ```
+//! use icu::properties::{maps, sets, GeneralCategory};
+//!
+//! // A binary property as a `CodePointSetData`
+//!
+//! assert!(sets::emoji().contains('🎃')); // U+1F383 JACK-O-LANTERN
+//! assert!(!sets::emoji().contains('木')); // U+6728
+//!
+//! // An individual enumerated property value as a `CodePointSetData`
+//!
+//! let line_sep_data = maps::general_category()
+//! .get_set_for_value(GeneralCategory::LineSeparator);
+//! let line_sep = line_sep_data.as_borrowed();
+//!
+//! assert!(line_sep.contains32(0x2028));
+//! assert!(!line_sep.contains32(0x2029));
+//! ```
+//!
+//! ## Property data as `CodePointMapData`s
+//!
+//! ```
+//! use icu::properties::{maps, Script};
+//!
+//! assert_eq!(maps::script().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
+//! assert_eq!(maps::script().get('木'), Script::Han); // U+6728
+//! ```
+//!
+//! [`ICU4X`]: ../icu/index.html
+//! [Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html
+//! [`CodePointSetData`]: crate::sets::CodePointSetData
+//! [`CodePointMapData`]: crate::maps::CodePointMapData
+//! [`sets`]: crate::sets
+
+// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations
+#![cfg_attr(not(any(test, feature = "std")), no_std)]
+#![cfg_attr(
+ not(test),
+ deny(
+ clippy::indexing_slicing,
+ clippy::unwrap_used,
+ clippy::expect_used,
+ clippy::panic,
+ clippy::exhaustive_structs,
+ clippy::exhaustive_enums,
+ missing_debug_implementations,
+ )
+)]
+#![warn(missing_docs)]
+
+extern crate alloc;
+
+#[cfg(feature = "bidi")]
+pub mod bidi;
+
+mod error;
+pub mod maps;
+
+// NOTE: The Pernosco debugger has special knowledge
+// of the `CanonicalCombiningClass` struct inside the `props`
+// module. Please do not change the crate-module-qualified
+// name of that struct without coordination.
+mod props;
+
+pub mod bidi_data;
+pub mod exemplar_chars;
+pub mod provider;
+pub(crate) mod runtime;
+#[allow(clippy::exhaustive_structs)] // TODO
+pub mod script;
+pub mod sets;
+mod trievalue;
+
+pub use props::{
+ BidiClass, CanonicalCombiningClass, EastAsianWidth, GeneralCategory, GeneralCategoryGroup,
+ GraphemeClusterBreak, IndicSyllabicCategory, LineBreak, Script, SentenceBreak, WordBreak,
+};
+
+/// Module for working with the names of property values
+pub mod names {
+ pub use crate::props::{
+ PropertyEnumToValueNameLinearMapper, PropertyEnumToValueNameLinearMapperBorrowed,
+ };
+ pub use crate::props::{
+ PropertyEnumToValueNameLinearTiny4Mapper, PropertyEnumToValueNameLinearTiny4MapperBorrowed,
+ };
+ pub use crate::props::{
+ PropertyEnumToValueNameSparseMapper, PropertyEnumToValueNameSparseMapperBorrowed,
+ };
+ pub use crate::props::{PropertyValueNameToEnumMapper, PropertyValueNameToEnumMapperBorrowed};
+}
+
+pub use error::PropertiesError;
+
+#[doc(no_inline)]
+pub use PropertiesError as Error;
diff --git a/third_party/rust/icu_properties/src/maps.rs b/third_party/rust/icu_properties/src/maps.rs
new file mode 100644
index 0000000000..478ef5f2c1
--- /dev/null
+++ b/third_party/rust/icu_properties/src/maps.rs
@@ -0,0 +1,602 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! The functions in this module return a [`CodePointMapData`] representing, for
+//! each code point in the entire range of code points, the property values
+//! for a particular Unicode property.
+//!
+//! The descriptions of most properties are taken from [`TR44`], the documentation for the
+//! Unicode Character Database.
+//!
+//! [`TR44`]: https://www.unicode.org/reports/tr44
+
+use crate::error::PropertiesError;
+use crate::provider::*;
+use crate::sets::CodePointSetData;
+#[cfg(doc)]
+use crate::*;
+use core::marker::PhantomData;
+use core::ops::RangeInclusive;
+use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
+use icu_provider::prelude::*;
+use zerovec::ZeroVecError;
+
+/// A wrapper around code point map data. It is returned by APIs that return Unicode
+/// property data in a map-like form, ex: enumerated property value data keyed
+/// by code point. Access its data via the borrowed version,
+/// [`CodePointMapDataBorrowed`].
+#[derive(Debug, Clone)]
+pub struct CodePointMapData<T: TrieValue> {
+ data: DataPayload<ErasedMaplikeMarker<T>>,
+}
+
+/// Private marker type for CodePointMapData
+/// to work for all same-value map properties at once
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+struct ErasedMaplikeMarker<T>(PhantomData<T>);
+impl<T: TrieValue> DataMarker for ErasedMaplikeMarker<T> {
+ type Yokeable = PropertyCodePointMapV1<'static, T>;
+}
+
+impl<T: TrieValue> CodePointMapData<T> {
+ /// Construct a borrowed version of this type that can be queried.
+ ///
+ /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
+ /// up front.
+ ///
+ /// This owned version is returned by functions that use a runtime data provider.
+ #[inline]
+ pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
+ CodePointMapDataBorrowed {
+ map: self.data.get(),
+ }
+ }
+
+ /// Convert this map to a map around another type
+ ///
+ /// Typically useful for type-erasing maps into maps around integers.
+ ///
+ /// # Panics
+ /// Will panic if T and P are different sizes
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, GeneralCategory};
+ ///
+ /// let data = maps::general_category().static_to_owned();
+ ///
+ /// let gc = data.try_into_converted::<u8>().unwrap();
+ /// let gc = gc.as_borrowed();
+ ///
+ /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728
+ /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
+ /// ```
+ pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, ZeroVecError>
+ where
+ P: TrieValue,
+ {
+ self.data
+ .try_map_project::<ErasedMaplikeMarker<P>, _, _>(move |data, _| {
+ data.try_into_converted()
+ })
+ .map(CodePointMapData::from_data)
+ }
+
+ /// Construct a new one from loaded data
+ ///
+ /// Typically it is preferable to use getters like [`load_general_category()`] instead
+ pub fn from_data<M>(data: DataPayload<M>) -> Self
+ where
+ M: DataMarker<Yokeable = PropertyCodePointMapV1<'static, T>>,
+ {
+ Self { data: data.cast() }
+ }
+
+ /// Construct a new one from an owned [`CodePointTrie`]
+ pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
+ let set = PropertyCodePointMapV1::from_code_point_trie(trie);
+ CodePointMapData::from_data(DataPayload::<ErasedMaplikeMarker<T>>::from_owned(set))
+ }
+
+ /// Convert this type to a [`CodePointTrie`] as a borrowed value.
+ ///
+ /// The data backing this is extensible and supports multiple implementations.
+ /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
+ /// added, and users may select which at data generation time.
+ ///
+ /// This method returns an `Option` in order to return `None` when the backing data provider
+ /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
+ /// constraint.
+ pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
+ self.data.get().as_code_point_trie()
+ }
+
+ /// Convert this type to a [`CodePointTrie`], borrowing if possible,
+ /// otherwise allocating a new [`CodePointTrie`].
+ ///
+ /// The data backing this is extensible and supports multiple implementations.
+ /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
+ /// added, and users may select which at data generation time.
+ ///
+ /// The performance of the conversion to this specific return type will vary
+ /// depending on the data structure that is backing `self`.
+ pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
+ self.data.get().to_code_point_trie()
+ }
+}
+
+/// A borrowed wrapper around code point map data, returned by
+/// [`CodePointMapData::as_borrowed()`]. More efficient to query.
+#[derive(Clone, Copy, Debug)]
+pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
+ map: &'a PropertyCodePointMapV1<'a, T>,
+}
+
+impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
+ /// Get the value this map has associated with code point `ch`
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, GeneralCategory};
+ ///
+ /// let gc = maps::general_category();
+ ///
+ /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
+ /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
+ /// ```
+ pub fn get(self, ch: char) -> T {
+ self.map.get32(ch as u32)
+ }
+
+ /// Get the value this map has associated with code point `ch`
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, GeneralCategory};
+ ///
+ /// let gc = maps::general_category();
+ ///
+ /// assert_eq!(gc.get32(0x6728), GeneralCategory::OtherLetter); // U+6728 (木)
+ /// assert_eq!(gc.get32(0x1F383), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
+ /// ```
+ pub fn get32(self, ch: u32) -> T {
+ self.map.get32(ch)
+ }
+
+ /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, GeneralCategory};
+ ///
+ /// let gc = maps::general_category();
+ ///
+ /// let other_letter_set_data =
+ /// gc.get_set_for_value(GeneralCategory::OtherLetter);
+ /// let other_letter_set = other_letter_set_data.as_borrowed();
+ ///
+ /// assert!(other_letter_set.contains('木')); // U+6728
+ /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
+ /// ```
+ pub fn get_set_for_value(self, value: T) -> CodePointSetData {
+ let set = self.map.get_set_for_value(value);
+ CodePointSetData::from_code_point_inversion_list(set)
+ }
+
+ /// Yields an [`Iterator`] returning ranges of consecutive code points that
+ /// share the same value in the [`CodePointMapData`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use core::ops::RangeInclusive;
+ /// use icu::properties::maps::{self, CodePointMapData};
+ /// use icu::properties::GeneralCategory;
+ ///
+ /// let gc = maps::general_category();
+ /// let mut ranges = gc.iter_ranges();
+ /// let next = ranges.next().unwrap();
+ /// assert_eq!(next.range, 0..=31);
+ /// assert_eq!(next.value, GeneralCategory::Control);
+ /// let next = ranges.next().unwrap();
+ /// assert_eq!(next.range, 32..=32);
+ /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
+ /// ```
+ pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
+ self.map.iter_ranges()
+ }
+
+ /// Yields an [`Iterator`] returning ranges of consecutive code points that
+ /// share the same value `val` in the [`CodePointMapData`].
+ ///
+ /// # Examples
+ ///
+ ///
+ /// ```
+ /// use core::ops::RangeInclusive;
+ /// use icu::properties::maps::{self, CodePointMapData};
+ /// use icu::properties::GeneralCategory;
+ ///
+ /// let gc = maps::general_category();
+ /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
+ /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
+ /// ```
+ pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
+ self.map
+ .iter_ranges()
+ .filter(move |r| r.value == val)
+ .map(|r| r.range)
+ }
+
+ /// Yields an [`Iterator`] returning ranges of consecutive code points that
+ /// do *not* have the value `val` in the [`CodePointMapData`].
+ pub fn iter_ranges_for_value_complemented(
+ self,
+ val: T,
+ ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
+ self.map
+ .iter_ranges_mapped(move |value| value != val)
+ .filter(|v| v.value)
+ .map(|v| v.range)
+ }
+
+ /// Exposed for FFI needs, could be exposed in general in the future but we should
+ /// have a use case first.
+ ///
+ /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
+ #[doc(hidden)]
+ pub fn iter_ranges_mapped<U: Eq + 'a>(
+ self,
+ predicate: impl FnMut(T) -> U + Copy + 'a,
+ ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
+ self.map.iter_ranges_mapped(predicate)
+ }
+}
+
+impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
+ /// Cheaply converts a `CodePointMapDataBorrowed<'static>` into a `CodePointMapData`.
+ pub const fn static_to_owned(self) -> CodePointMapData<T> {
+ CodePointMapData {
+ data: DataPayload::from_static_ref(self.map),
+ }
+ }
+}
+
+impl<'a> CodePointMapDataBorrowed<'a, crate::GeneralCategory> {
+ /// Yields an [`Iterator`] returning ranges of consecutive code points that
+ /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
+ ///
+ /// # Examples
+ ///
+ ///
+ /// ```
+ /// use core::ops::RangeInclusive;
+ /// use icu::properties::maps::{self, CodePointMapData};
+ /// use icu::properties::GeneralCategoryGroup;
+ ///
+ /// let gc = maps::general_category();
+ /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
+ /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
+ /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32);
+ /// ```
+ pub fn iter_ranges_for_group(
+ self,
+ group: crate::GeneralCategoryGroup,
+ ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
+ self.map
+ .iter_ranges_mapped(move |value| group.contains(value))
+ .filter(|v| v.value)
+ .map(|v| v.range)
+ }
+}
+
+macro_rules! make_map_property {
+ (
+ // currently unused
+ property: $prop_name:expr;
+ // currently unused
+ marker: $marker_name:ident;
+ value: $value_ty:path;
+ keyed_data_marker: $keyed_data_marker:ty;
+ func:
+ $(#[$doc:meta])*
+ $vis2:vis const $constname:ident => $singleton:ident;
+ $vis:vis fn $name:ident();
+ ) => {
+ #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")]
+ ///
+ /// Note that this will return an owned version of the data. Functionality is available on
+ /// the borrowed version, accessible through [`CodePointMapData::as_borrowed`].
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ $vis fn $name(
+ provider: &(impl DataProvider<$keyed_data_marker> + ?Sized)
+ ) -> Result<CodePointMapData<$value_ty>, PropertiesError> {
+ Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map(CodePointMapData::from_data)?)
+ }
+ $(#[$doc])*
+ #[cfg(feature = "compiled_data")]
+ pub const fn $constname() -> CodePointMapDataBorrowed<'static, $value_ty> {
+ CodePointMapDataBorrowed {
+ map: crate::provider::Baked::$singleton
+ }
+ }
+ };
+}
+
+make_map_property! {
+ property: "General_Category";
+ marker: GeneralCategoryProperty;
+ value: crate::GeneralCategory;
+ keyed_data_marker: GeneralCategoryV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the General_Category Unicode enumerated property. See [`GeneralCategory`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, GeneralCategory};
+ ///
+ /// assert_eq!(maps::general_category().get('木'), GeneralCategory::OtherLetter); // U+6728
+ /// assert_eq!(maps::general_category().get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
+ /// ```
+ pub const general_category => SINGLETON_PROPS_GC_V1;
+ pub fn load_general_category();
+}
+
+make_map_property! {
+ property: "Bidi_Class";
+ marker: BidiClassProperty;
+ value: crate::BidiClass;
+ keyed_data_marker: BidiClassV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Bidi_Class Unicode enumerated property. See [`BidiClass`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, BidiClass};
+ ///
+ /// assert_eq!(maps::bidi_class().get('y'), BidiClass::LeftToRight); // U+0079
+ /// assert_eq!(maps::bidi_class().get('ع'), BidiClass::ArabicLetter); // U+0639
+ /// ```
+ pub const bidi_class => SINGLETON_PROPS_BC_V1;
+ pub fn load_bidi_class();
+}
+
+make_map_property! {
+ property: "Script";
+ marker: ScriptProperty;
+ value: crate::Script;
+ keyed_data_marker: ScriptV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Script Unicode enumerated property. See [`Script`].
+ ///
+ /// **Note:** Some code points are associated with multiple scripts. If you are trying to
+ /// determine whether a code point belongs to a certain script, you should use
+ /// [`load_script_with_extensions_unstable`] and [`ScriptWithExtensionsBorrowed::has_script`]
+ /// instead of this function.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, Script};
+ ///
+ /// assert_eq!(maps::script().get('木'), Script::Han); // U+6728
+ /// assert_eq!(maps::script().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
+ /// ```
+ /// [`load_script_with_extensions_unstable`]: crate::script::load_script_with_extensions_unstable
+ /// [`ScriptWithExtensionsBorrowed::has_script`]: crate::script::ScriptWithExtensionsBorrowed::has_script
+ pub const script => SINGLETON_PROPS_SC_V1;
+ pub fn load_script();
+}
+
+make_map_property! {
+ property: "East_Asian_Width";
+ marker: EastAsianWidthProperty;
+ value: crate::EastAsianWidth;
+ keyed_data_marker: EastAsianWidthV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the East_Asian_Width Unicode enumerated
+ /// property. See [`EastAsianWidth`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, EastAsianWidth};
+ ///
+ /// assert_eq!(maps::east_asian_width().get('\u{FF71}'), EastAsianWidth::Halfwidth); // U+FF71: Halfwidth Katakana Letter A
+ /// assert_eq!(maps::east_asian_width().get('ア'), EastAsianWidth::Wide); //U+30A2: Katakana Letter A
+ /// ```
+ pub const east_asian_width => SINGLETON_PROPS_EA_V1;
+ pub fn load_east_asian_width();
+}
+
+make_map_property! {
+ property: "Line_Break";
+ marker: LineBreakProperty;
+ value: crate::LineBreak;
+ keyed_data_marker: LineBreakV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Line_Break Unicode enumerated
+ /// property. See [`LineBreak`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, LineBreak};
+ ///
+ /// assert_eq!(maps::line_break().get(')'), LineBreak::CloseParenthesis); // U+0029: Right Parenthesis
+ /// assert_eq!(maps::line_break().get('ぁ'), LineBreak::ConditionalJapaneseStarter); //U+3041: Hiragana Letter Small A
+ /// ```
+ pub const line_break => SINGLETON_PROPS_LB_V1;
+ pub fn load_line_break();
+}
+
+make_map_property! {
+ property: "Grapheme_Cluster_Break";
+ marker: GraphemeClusterBreakProperty;
+ value: crate::GraphemeClusterBreak;
+ keyed_data_marker: GraphemeClusterBreakV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Grapheme_Cluster_Break Unicode enumerated
+ /// property. See [`GraphemeClusterBreak`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, GraphemeClusterBreak};
+ ///
+ /// assert_eq!(maps::grapheme_cluster_break().get('🇦'), GraphemeClusterBreak::RegionalIndicator); // U+1F1E6: Regional Indicator Symbol Letter A
+ /// assert_eq!(maps::grapheme_cluster_break().get('ำ'), GraphemeClusterBreak::SpacingMark); //U+0E33: Thai Character Sara Am
+ /// ```
+ pub const grapheme_cluster_break => SINGLETON_PROPS_GCB_V1;
+ pub fn load_grapheme_cluster_break();
+}
+
+make_map_property! {
+ property: "Word_Break";
+ marker: WordBreakProperty;
+ value: crate::WordBreak;
+ keyed_data_marker: WordBreakV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Word_Break Unicode enumerated
+ /// property. See [`WordBreak`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, WordBreak};
+ ///
+ /// assert_eq!(maps::word_break().get('.'), WordBreak::MidNumLet); // U+002E: Full Stop
+ /// assert_eq!(maps::word_break().get('\u{FF0C}'), WordBreak::MidNum); // U+FF0C: Fullwidth Comma
+ /// ```
+ pub const word_break => SINGLETON_PROPS_WB_V1;
+ pub fn load_word_break();
+}
+
+make_map_property! {
+ property: "Sentence_Break";
+ marker: SentenceBreakProperty;
+ value: crate::SentenceBreak;
+ keyed_data_marker: SentenceBreakV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Sentence_Break Unicode enumerated
+ /// property. See [`SentenceBreak`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, SentenceBreak};
+ ///
+ /// assert_eq!(maps::sentence_break().get('\u{FF19}'), SentenceBreak::Numeric); // U+FF19: Fullwidth Digit Nine
+ /// assert_eq!(maps::sentence_break().get(','), SentenceBreak::SContinue); // U+002C: Comma
+ /// ```
+ pub const sentence_break => SINGLETON_PROPS_SB_V1;
+ pub fn load_sentence_break();
+}
+
+make_map_property! {
+ property: "Canonical_Combining_Class";
+ marker: CanonicalCombiningClassProperty;
+ value: crate::CanonicalCombiningClass;
+ keyed_data_marker: CanonicalCombiningClassV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Canonical_Combining_Class Unicode property. See
+ /// [`CanonicalCombiningClass`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// **Note:** See `icu_normalizer::CanonicalCombiningClassMap` for the preferred API
+ /// to look up the Canonical_Combining_Class property by scalar value.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, CanonicalCombiningClass};
+ ///
+ /// assert_eq!(maps::canonical_combining_class().get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
+ /// assert_eq!(maps::canonical_combining_class().get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
+ /// ```
+ pub const canonical_combining_class => SINGLETON_PROPS_CCC_V1;
+ pub fn load_canonical_combining_class();
+}
+
+make_map_property! {
+ property: "Indic_Syllabic_Category";
+ marker: IndicSyllabicCategoryProperty;
+ value: crate::IndicSyllabicCategory;
+ keyed_data_marker: IndicSyllabicCategoryV1Marker;
+ func:
+ /// Return a [`CodePointMapDataBorrowed`] for the Indic_Syllabic_Category Unicode property. See
+ /// [`IndicSyllabicCategory`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::{maps, IndicSyllabicCategory};
+ ///
+ /// assert_eq!(maps::indic_syllabic_category().get('a'), IndicSyllabicCategory::Other);
+ /// assert_eq!(maps::indic_syllabic_category().get32(0x0900), IndicSyllabicCategory::Bindu); // U+0900: DEVANAGARI SIGN INVERTED CANDRABINDU
+ /// ```
+ pub const indic_syllabic_category => SINGLETON_PROPS_INSC_V1;
+ pub fn load_indic_syllabic_category();
+}
diff --git a/third_party/rust/icu_properties/src/props.rs b/third_party/rust/icu_properties/src/props.rs
new file mode 100644
index 0000000000..247b505c81
--- /dev/null
+++ b/third_party/rust/icu_properties/src/props.rs
@@ -0,0 +1,2365 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! A collection of property definitions shared across contexts
+//! (ex: representing trie values).
+//!
+//! This module defines enums / newtypes for enumerated properties.
+//! String properties are represented as newtypes if their
+//! values represent code points.
+
+use crate::provider::{names::*, *};
+use crate::PropertiesError;
+use core::marker::PhantomData;
+use icu_collections::codepointtrie::TrieValue;
+use icu_provider::prelude::*;
+use zerovec::ule::VarULE;
+
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+
+/// Private marker type for PropertyValueNameToEnumMapper
+/// to work for all properties at once
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct ErasedNameToEnumMapV1Marker;
+impl DataMarker for ErasedNameToEnumMapV1Marker {
+ type Yokeable = PropertyValueNameToEnumMapV1<'static>;
+}
+
+/// A struct capable of looking up a property value from a string name.
+/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
+/// [`PropertyValueNameToEnumMapperBorrowed`].
+///
+/// The name can be a short name (`Lu`), a long name (`Uppercase_Letter`),
+/// or an alias.
+///
+/// Property names can be looked up using "strict" matching (looking for a name
+/// that matches exactly), or "loose matching", where the name is allowed to deviate
+/// in terms of ASCII casing, whitespace, underscores, and hyphens.
+///
+/// # Example
+///
+/// ```
+/// use icu::properties::GeneralCategory;
+///
+/// let lookup = GeneralCategory::name_to_enum_mapper();
+/// // short name for value
+/// assert_eq!(
+/// lookup.get_strict("Lu"),
+/// Some(GeneralCategory::UppercaseLetter)
+/// );
+/// assert_eq!(
+/// lookup.get_strict("Pd"),
+/// Some(GeneralCategory::DashPunctuation)
+/// );
+/// // long name for value
+/// assert_eq!(
+/// lookup.get_strict("Uppercase_Letter"),
+/// Some(GeneralCategory::UppercaseLetter)
+/// );
+/// assert_eq!(
+/// lookup.get_strict("Dash_Punctuation"),
+/// Some(GeneralCategory::DashPunctuation)
+/// );
+/// // name has incorrect casing
+/// assert_eq!(lookup.get_strict("dashpunctuation"), None);
+/// // loose matching of name
+/// assert_eq!(
+/// lookup.get_loose("dash-punctuation"),
+/// Some(GeneralCategory::DashPunctuation)
+/// );
+/// // fake property
+/// assert_eq!(lookup.get_strict("Animated_Gif"), None);
+/// ```
+#[derive(Debug)]
+pub struct PropertyValueNameToEnumMapper<T> {
+ map: DataPayload<ErasedNameToEnumMapV1Marker>,
+ markers: PhantomData<fn() -> T>,
+}
+
+/// A borrowed wrapper around property value name-to-enum data, returned by
+/// [`PropertyValueNameToEnumMapper::as_borrowed()`]. More efficient to query.
+#[derive(Debug)]
+pub struct PropertyValueNameToEnumMapperBorrowed<'a, T> {
+ map: &'a PropertyValueNameToEnumMapV1<'a>,
+ markers: PhantomData<fn() -> T>,
+}
+
+impl<T: TrieValue> PropertyValueNameToEnumMapper<T> {
+ /// Construct a borrowed version of this type that can be queried.
+ ///
+ /// This avoids a potential small underlying cost per API call (like `get_strict()`) by consolidating it
+ /// up front.
+ #[inline]
+ pub fn as_borrowed(&self) -> PropertyValueNameToEnumMapperBorrowed<'_, T> {
+ PropertyValueNameToEnumMapperBorrowed {
+ map: self.map.get(),
+ markers: PhantomData,
+ }
+ }
+
+ pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
+ where
+ M: DataMarker<Yokeable = PropertyValueNameToEnumMapV1<'static>>,
+ {
+ Self {
+ map: data.cast(),
+ markers: PhantomData,
+ }
+ }
+
+ #[doc(hidden)] // used by FFI code
+ pub fn erase(self) -> PropertyValueNameToEnumMapper<u16> {
+ PropertyValueNameToEnumMapper {
+ map: self.map.cast(),
+ markers: PhantomData,
+ }
+ }
+}
+
+impl<T: TrieValue> PropertyValueNameToEnumMapperBorrowed<'_, T> {
+ /// Get the property value as a u16, doing a strict search looking for
+ /// names that match exactly
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::name_to_enum_mapper();
+ /// assert_eq!(
+ /// lookup.get_strict_u16("Lu"),
+ /// Some(GeneralCategory::UppercaseLetter as u16)
+ /// );
+ /// assert_eq!(
+ /// lookup.get_strict_u16("Uppercase_Letter"),
+ /// Some(GeneralCategory::UppercaseLetter as u16)
+ /// );
+ /// // does not do loose matching
+ /// assert_eq!(lookup.get_strict_u16("UppercaseLetter"), None);
+ /// ```
+ #[inline]
+ pub fn get_strict_u16(&self, name: &str) -> Option<u16> {
+ get_strict_u16(self.map, name)
+ }
+
+ /// Get the property value as a `T`, doing a strict search looking for
+ /// names that match exactly
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::name_to_enum_mapper();
+ /// assert_eq!(
+ /// lookup.get_strict("Lu"),
+ /// Some(GeneralCategory::UppercaseLetter)
+ /// );
+ /// assert_eq!(
+ /// lookup.get_strict("Uppercase_Letter"),
+ /// Some(GeneralCategory::UppercaseLetter)
+ /// );
+ /// // does not do loose matching
+ /// assert_eq!(lookup.get_strict("UppercaseLetter"), None);
+ /// ```
+ #[inline]
+ pub fn get_strict(&self, name: &str) -> Option<T> {
+ T::try_from_u32(self.get_strict_u16(name)? as u32).ok()
+ }
+
+ /// Get the property value as a u16, doing a loose search looking for
+ /// names that match case-insensitively, ignoring ASCII hyphens, underscores, and
+ /// whitespaces.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::name_to_enum_mapper();
+ /// assert_eq!(
+ /// lookup.get_loose_u16("Lu"),
+ /// Some(GeneralCategory::UppercaseLetter as u16)
+ /// );
+ /// assert_eq!(
+ /// lookup.get_loose_u16("Uppercase_Letter"),
+ /// Some(GeneralCategory::UppercaseLetter as u16)
+ /// );
+ /// // does do loose matching
+ /// assert_eq!(
+ /// lookup.get_loose_u16("UppercaseLetter"),
+ /// Some(GeneralCategory::UppercaseLetter as u16)
+ /// );
+ /// ```
+ #[inline]
+ pub fn get_loose_u16(&self, name: &str) -> Option<u16> {
+ get_loose_u16(self.map, name)
+ }
+
+ /// Get the property value as a `T`, doing a loose search looking for
+ /// names that match case-insensitively, ignoring ASCII hyphens, underscores, and
+ /// whitespaces.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::name_to_enum_mapper();
+ /// assert_eq!(
+ /// lookup.get_loose("Lu"),
+ /// Some(GeneralCategory::UppercaseLetter)
+ /// );
+ /// assert_eq!(
+ /// lookup.get_loose("Uppercase_Letter"),
+ /// Some(GeneralCategory::UppercaseLetter)
+ /// );
+ /// // does do loose matching
+ /// assert_eq!(
+ /// lookup.get_loose("UppercaseLetter"),
+ /// Some(GeneralCategory::UppercaseLetter)
+ /// );
+ /// ```
+ #[inline]
+ pub fn get_loose(&self, name: &str) -> Option<T> {
+ T::try_from_u32(self.get_loose_u16(name)? as u32).ok()
+ }
+}
+
+impl<T: TrieValue> PropertyValueNameToEnumMapperBorrowed<'static, T> {
+ /// Cheaply converts a `PropertyValueNameToEnumMapperBorrowed<'static>` into a `PropertyValueNameToEnumMapper`.
+ pub const fn static_to_owned(self) -> PropertyValueNameToEnumMapper<T> {
+ PropertyValueNameToEnumMapper {
+ map: DataPayload::from_static_ref(self.map),
+ markers: PhantomData,
+ }
+ }
+}
+
+/// Avoid monomorphizing multiple copies of this function
+fn get_strict_u16(payload: &PropertyValueNameToEnumMapV1<'_>, name: &str) -> Option<u16> {
+ // NormalizedPropertyName has no invariants so this should be free, but
+ // avoid introducing a panic regardless
+ let name = NormalizedPropertyNameStr::parse_byte_slice(name.as_bytes()).ok()?;
+ payload.map.get_copied(name)
+}
+
+/// Avoid monomorphizing multiple copies of this function
+fn get_loose_u16(payload: &PropertyValueNameToEnumMapV1<'_>, name: &str) -> Option<u16> {
+ // NormalizedPropertyName has no invariants so this should be free, but
+ // avoid introducing a panic regardless
+ let name = NormalizedPropertyNameStr::parse_byte_slice(name.as_bytes()).ok()?;
+ payload.map.get_copied_by(|p| p.cmp_loose(name))
+}
+
+/// Private marker type for PropertyEnumToValueNameSparseMapper
+/// to work for all properties at once
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct ErasedEnumToValueNameSparseMapV1Marker;
+impl DataMarker for ErasedEnumToValueNameSparseMapV1Marker {
+ type Yokeable = PropertyEnumToValueNameSparseMapV1<'static>;
+}
+
+/// A struct capable of looking up a property name from a value.
+/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
+/// [`PropertyEnumToValueNameSparseMapperBorrowed`].
+///
+/// This mapper is used for properties with sparse values, like [`CanonicalCombiningClass`].
+/// It may be obtained using methods like [`CanonicalCombiningClass::get_enum_to_long_name_mapper()`].
+///
+/// The name returned may be a short (`"KV"`) or long (`"Kana_Voicing"`) name, depending
+/// on the constructor used.
+///
+/// # Example
+///
+/// ```
+/// use icu::properties::CanonicalCombiningClass;
+///
+/// let lookup = CanonicalCombiningClass::enum_to_long_name_mapper();
+/// assert_eq!(
+/// lookup.get(CanonicalCombiningClass::KanaVoicing),
+/// Some("Kana_Voicing")
+/// );
+/// assert_eq!(
+/// lookup.get(CanonicalCombiningClass::AboveLeft),
+/// Some("Above_Left")
+/// );
+/// ```
+#[derive(Debug)]
+pub struct PropertyEnumToValueNameSparseMapper<T> {
+ map: DataPayload<ErasedEnumToValueNameSparseMapV1Marker>,
+ markers: PhantomData<fn(T) -> ()>,
+}
+
+/// A borrowed wrapper around property enum-to-value-name data, returned by
+/// [`PropertyEnumToValueNameSparseMapper::as_borrowed()`]. More efficient to query.
+#[derive(Debug)]
+pub struct PropertyEnumToValueNameSparseMapperBorrowed<'a, T> {
+ map: &'a PropertyEnumToValueNameSparseMapV1<'a>,
+ markers: PhantomData<fn(T) -> ()>,
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameSparseMapper<T> {
+ /// Construct a borrowed version of this type that can be queried.
+ ///
+ /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
+ /// up front.
+ #[inline]
+ pub fn as_borrowed(&self) -> PropertyEnumToValueNameSparseMapperBorrowed<'_, T> {
+ PropertyEnumToValueNameSparseMapperBorrowed {
+ map: self.map.get(),
+ markers: PhantomData,
+ }
+ }
+
+ /// Construct a new one from loaded data
+ ///
+ /// Typically it is preferable to use methods on individual property value types
+ /// (like [`Script::TBD()`]) instead.
+ pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
+ where
+ M: DataMarker<Yokeable = PropertyEnumToValueNameSparseMapV1<'static>>,
+ {
+ Self {
+ map: data.cast(),
+ markers: PhantomData,
+ }
+ }
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameSparseMapperBorrowed<'_, T> {
+    /// Look up the name stored for a property value.
+    ///
+    /// Returns `None` when the value does not fit in the map's `u16` key type or
+    /// when the map has no entry for it.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use icu::properties::CanonicalCombiningClass;
+    ///
+    /// let lookup = CanonicalCombiningClass::enum_to_long_name_mapper();
+    /// assert_eq!(
+    ///     lookup.get(CanonicalCombiningClass::KanaVoicing),
+    ///     Some("Kana_Voicing")
+    /// );
+    /// assert_eq!(
+    ///     lookup.get(CanonicalCombiningClass::AboveLeft),
+    ///     Some("Above_Left")
+    /// );
+    /// ```
+    #[inline]
+    pub fn get(&self, property: T) -> Option<&str> {
+        match u16::try_from(property.to_u32()) {
+            Ok(key) => self.map.map.get(&key),
+            Err(_) => None,
+        }
+    }
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameSparseMapperBorrowed<'static, T> {
+    /// Cheaply converts a `PropertyEnumToValueNameSparseMapperBorrowed<'static>` into a
+    /// `PropertyEnumToValueNameSparseMapper` by wrapping the static reference in a payload.
+    pub const fn static_to_owned(self) -> PropertyEnumToValueNameSparseMapper<T> {
+        let map: DataPayload<ErasedEnumToValueNameSparseMapV1Marker> =
+            DataPayload::from_static_ref(self.map);
+        PropertyEnumToValueNameSparseMapper {
+            map,
+            markers: PhantomData,
+        }
+    }
+}
+
+/// Private, type-erased marker type that lets PropertyEnumToValueNameLinearMapper
+/// hold the data of any property, regardless of the marker it was loaded under
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct ErasedEnumToValueNameLinearMapV1Marker;
+impl DataMarker for ErasedEnumToValueNameLinearMapV1Marker {
+ type Yokeable = PropertyEnumToValueNameLinearMapV1<'static>;
+}
+
+/// A struct capable of looking up a property name from a value.
+/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
+/// [`PropertyEnumToValueNameLinearMapperBorrowed`].
+///
+/// This mapper is used for properties with sequential values, like [`GeneralCategory`].
+/// It may be obtained using methods like [`GeneralCategory::get_enum_to_long_name_mapper()`].
+///
+/// The name returned may be a short (`"Lu"`) or long (`"Uppercase_Letter"`) name, depending
+/// on the constructor used.
+///
+/// # Example
+///
+/// ```
+/// use icu::properties::GeneralCategory;
+///
+/// let lookup = GeneralCategory::enum_to_long_name_mapper();
+/// assert_eq!(
+/// lookup.get(GeneralCategory::UppercaseLetter),
+/// Some("Uppercase_Letter")
+/// );
+/// assert_eq!(
+/// lookup.get(GeneralCategory::DashPunctuation),
+/// Some("Dash_Punctuation")
+/// );
+/// ```
+#[derive(Debug)]
+pub struct PropertyEnumToValueNameLinearMapper<T> {
+ map: DataPayload<ErasedEnumToValueNameLinearMapV1Marker>, // payload stored under the type-erased marker
+ markers: PhantomData<fn(T) -> ()>, // remembers the property type `T` without storing a `T`
+}
+
+/// A borrowed wrapper around property enum-to-value-name data, returned by
+/// [`PropertyEnumToValueNameLinearMapper::as_borrowed()`]. More efficient to query.
+#[derive(Debug)]
+pub struct PropertyEnumToValueNameLinearMapperBorrowed<'a, T> {
+ map: &'a PropertyEnumToValueNameLinearMapV1<'a>,
+ markers: PhantomData<fn(T) -> ()>,
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameLinearMapper<T> {
+    /// Borrow the underlying data as a [`PropertyEnumToValueNameLinearMapperBorrowed`],
+    /// which is the type that can be queried.
+    ///
+    /// Doing this once up front consolidates a potential small underlying cost that
+    /// would otherwise be paid per API call (like `get_static()`).
+    #[inline]
+    pub fn as_borrowed(&self) -> PropertyEnumToValueNameLinearMapperBorrowed<'_, T> {
+        let map = self.map.get();
+        PropertyEnumToValueNameLinearMapperBorrowed {
+            map,
+            markers: PhantomData,
+        }
+    }
+
+    /// Wrap an already-loaded data payload in a mapper.
+    ///
+    /// The payload may carry any marker `M` whose yokeable type is the linear map;
+    /// it is cast to the crate-private erased marker before being stored.
+    ///
+    /// Typically it is preferable to use methods on individual property value types
+    /// (like [`GeneralCategory::get_enum_to_long_name_mapper()`]) instead.
+    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
+    where
+        M: DataMarker<Yokeable = PropertyEnumToValueNameLinearMapV1<'static>>,
+    {
+        let map: DataPayload<ErasedEnumToValueNameLinearMapV1Marker> = data.cast();
+        Self {
+            map,
+            markers: PhantomData,
+        }
+    }
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameLinearMapperBorrowed<'_, T> {
+    /// Look up the name stored for a property value.
+    ///
+    /// The value is used as an index into the name table. Returns `None` when the
+    /// value cannot be converted to a `usize`, when the table has no element at that
+    /// index, or when the element there is an empty string.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use icu::properties::GeneralCategory;
+    ///
+    /// let lookup = GeneralCategory::enum_to_short_name_mapper();
+    /// assert_eq!(lookup.get(GeneralCategory::UppercaseLetter), Some("Lu"));
+    /// assert_eq!(lookup.get(GeneralCategory::DashPunctuation), Some("Pd"));
+    /// ```
+    #[inline]
+    pub fn get(&self, property: T) -> Option<&str> {
+        let index = match usize::try_from(property.to_u32()) {
+            Ok(index) => index,
+            Err(_) => return None,
+        };
+        match self.map.map.get(index) {
+            Some(name) if !name.is_empty() => Some(name),
+            _ => None,
+        }
+    }
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameLinearMapperBorrowed<'static, T> {
+    /// Cheaply converts a `PropertyEnumToValueNameLinearMapperBorrowed<'static>` into a
+    /// `PropertyEnumToValueNameLinearMapper` by wrapping the static reference in a payload.
+    pub const fn static_to_owned(self) -> PropertyEnumToValueNameLinearMapper<T> {
+        let map: DataPayload<ErasedEnumToValueNameLinearMapV1Marker> =
+            DataPayload::from_static_ref(self.map);
+        PropertyEnumToValueNameLinearMapper {
+            map,
+            markers: PhantomData,
+        }
+    }
+}
+
+/// Private, type-erased marker type that lets PropertyEnumToValueNameLinearTiny4Mapper
+/// hold the data of any property, regardless of the marker it was loaded under
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct ErasedEnumToValueNameLinearTiny4MapV1Marker;
+impl DataMarker for ErasedEnumToValueNameLinearTiny4MapV1Marker {
+ type Yokeable = PropertyEnumToValueNameLinearTiny4MapV1<'static>;
+}
+
+/// A struct capable of looking up a property name from a value.
+/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
+/// [`PropertyEnumToValueNameLinearTiny4MapperBorrowed`].
+///
+/// This mapper is used for properties with sequential values and names with four or fewer characters,
+/// like the [`Script`] short names.
+/// It may be obtained using methods like [`Script::get_enum_to_short_name_mapper()`].
+///
+/// # Example
+///
+/// ```
+/// use icu::properties::Script;
+/// use tinystr::tinystr;
+///
+/// let lookup = Script::enum_to_short_name_mapper();
+/// assert_eq!(lookup.get(Script::Brahmi), Some(tinystr!(4, "Brah")));
+/// assert_eq!(lookup.get(Script::Hangul), Some(tinystr!(4, "Hang")));
+/// ```
+#[derive(Debug)]
+pub struct PropertyEnumToValueNameLinearTiny4Mapper<T> {
+ map: DataPayload<ErasedEnumToValueNameLinearTiny4MapV1Marker>, // payload stored under the type-erased marker
+ markers: PhantomData<fn(T) -> ()>, // remembers the property type `T` without storing a `T`
+}
+
+/// A borrowed wrapper around property enum-to-value-name data, returned by
+/// [`PropertyEnumToValueNameLinearTiny4Mapper::as_borrowed()`]. More efficient to query.
+#[derive(Debug)]
+pub struct PropertyEnumToValueNameLinearTiny4MapperBorrowed<'a, T> {
+ map: &'a PropertyEnumToValueNameLinearTiny4MapV1<'a>,
+ markers: PhantomData<fn(T) -> ()>,
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameLinearTiny4Mapper<T> {
+    /// Borrow the underlying data as a [`PropertyEnumToValueNameLinearTiny4MapperBorrowed`],
+    /// which is the type that can be queried.
+    ///
+    /// Doing this once up front consolidates a potential small underlying cost that
+    /// would otherwise be paid per API call (like `get_static()`).
+    #[inline]
+    pub fn as_borrowed(&self) -> PropertyEnumToValueNameLinearTiny4MapperBorrowed<'_, T> {
+        let map = self.map.get();
+        PropertyEnumToValueNameLinearTiny4MapperBorrowed {
+            map,
+            markers: PhantomData,
+        }
+    }
+
+    /// Wrap an already-loaded data payload in a mapper.
+    ///
+    /// The payload may carry any marker `M` whose yokeable type is the Tiny4 linear map;
+    /// it is cast to the crate-private erased marker before being stored.
+    ///
+    /// Typically it is preferable to use methods on individual property value types
+    /// (like [`Script::get_enum_to_short_name_mapper()`]) instead.
+    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
+    where
+        M: DataMarker<Yokeable = PropertyEnumToValueNameLinearTiny4MapV1<'static>>,
+    {
+        let map: DataPayload<ErasedEnumToValueNameLinearTiny4MapV1Marker> = data.cast();
+        Self {
+            map,
+            markers: PhantomData,
+        }
+    }
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameLinearTiny4MapperBorrowed<'_, T> {
+    /// Look up the name stored for a property value.
+    ///
+    /// The value is used as an index into the name table. Returns `None` when the
+    /// value cannot be converted to a `usize`, when the table has no element at that
+    /// index, or when the element there is an empty string.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use icu::properties::Script;
+    /// use tinystr::tinystr;
+    ///
+    /// let lookup = Script::enum_to_short_name_mapper();
+    /// assert_eq!(lookup.get(Script::Brahmi), Some(tinystr!(4, "Brah")));
+    /// assert_eq!(lookup.get(Script::Hangul), Some(tinystr!(4, "Hang")));
+    /// ```
+    #[inline]
+    pub fn get(&self, property: T) -> Option<tinystr::TinyStr4> {
+        let index = match usize::try_from(property.to_u32()) {
+            Ok(index) => index,
+            Err(_) => return None,
+        };
+        match self.map.map.get(index) {
+            Some(name) if !name.is_empty() => Some(name),
+            _ => None,
+        }
+    }
+}
+
+impl<T: TrieValue> PropertyEnumToValueNameLinearTiny4MapperBorrowed<'static, T> {
+    /// Cheaply converts a `PropertyEnumToValueNameLinearTiny4MapperBorrowed<'static>` into a
+    /// `PropertyEnumToValueNameLinearTiny4Mapper` by wrapping the static reference in a payload.
+    pub const fn static_to_owned(self) -> PropertyEnumToValueNameLinearTiny4Mapper<T> {
+        let map: DataPayload<ErasedEnumToValueNameLinearTiny4MapV1Marker> =
+            DataPayload::from_static_ref(self.map);
+        PropertyEnumToValueNameLinearTiny4Mapper {
+            map,
+            markers: PhantomData,
+        }
+    }
+}
+
+macro_rules! impl_value_getter {
+ (
+ // marker / baked-singleton pairs for the names lookups: name_to_enum is required; enum_to_short_name and enum_to_long_name are optional and must be given together
+ markers: $marker_n2e:ident / $singleton_n2e:ident $(, $marker_e2sn:ident / $singleton_e2sn:ident, $marker_e2ln:ident / $singleton_e2ln:ident)?;
+ impl $ty:ident {
+ $(#[$attr_n2e:meta])*
+ $vis_n2e:vis fn $name_n2e:ident() / $cname_n2e:ident(); // provider-based getter name / compiled-data getter name
+ $(
+
+ $(#[$attr_e2sn:meta])*
+ $vis_e2sn:vis fn $name_e2sn:ident() / $cname_e2sn:ident() -> $mapper_e2sn:ident / $mapper_e2snb:ident; // ... -> owned mapper type / borrowed mapper type
+ $(#[$attr_e2ln:meta])*
+ $vis_e2ln:vis fn $name_e2ln:ident() / $cname_e2ln:ident() -> $mapper_e2ln:ident / $mapper_e2lnb:ident;
+ )?
+ }
+ ) => {
+ impl $ty {
+ $(#[$attr_n2e])*
+ #[cfg(feature = "compiled_data")]
+ $vis_n2e fn $cname_n2e() -> PropertyValueNameToEnumMapperBorrowed<'static, $ty> {
+ PropertyValueNameToEnumMapperBorrowed {
+ map: crate::provider::Baked::$singleton_n2e,
+ markers: PhantomData,
+ }
+ }
+
+ #[doc = concat!("A version of [`", stringify!($ty), "::", stringify!($cname_n2e), "()`] that uses custom data provided by a [`DataProvider`].")]
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ $vis_n2e fn $name_n2e(
+ provider: &(impl DataProvider<$marker_n2e> + ?Sized)
+ ) -> Result<PropertyValueNameToEnumMapper<$ty>, PropertiesError> {
+ Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map(PropertyValueNameToEnumMapper::from_data)?)
+ }
+
+ $( // emitted only when the enum-to-name markers were supplied
+ $(#[$attr_e2sn])*
+ #[cfg(feature = "compiled_data")]
+ $vis_e2sn fn $cname_e2sn() -> $mapper_e2snb<'static, $ty> {
+ $mapper_e2snb {
+ map: crate::provider::Baked::$singleton_e2sn,
+ markers: PhantomData,
+ }
+ }
+
+ #[doc = concat!("A version of [`", stringify!($ty), "::", stringify!($cname_e2sn), "()`] that uses custom data provided by a [`DataProvider`].")]
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ $vis_e2sn fn $name_e2sn(
+ provider: &(impl DataProvider<$marker_e2sn> + ?Sized)
+ ) -> Result<$mapper_e2sn<$ty>, PropertiesError> {
+ Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map($mapper_e2sn::from_data)?)
+ }
+
+ $(#[$attr_e2ln])*
+ #[cfg(feature = "compiled_data")]
+ $vis_e2ln fn $cname_e2ln() -> $mapper_e2lnb<'static, $ty> {
+ $mapper_e2lnb {
+ map: crate::provider::Baked::$singleton_e2ln,
+ markers: PhantomData,
+ }
+ }
+
+ #[doc = concat!("A version of [`", stringify!($ty), "::", stringify!($cname_e2ln), "()`] that uses custom data provided by a [`DataProvider`].")]
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ $vis_e2ln fn $name_e2ln(
+ provider: &(impl DataProvider<$marker_e2ln> + ?Sized)
+ ) -> Result<$mapper_e2ln<$ty>, PropertiesError> {
+ Ok(provider.load(Default::default()).and_then(DataResponse::take_payload).map($mapper_e2ln::from_data)?)
+ }
+ )?
+ }
+ }
+}
+
+/// Enumerated property Bidi_Class.
+///
+/// These are the categories required by the Unicode Bidirectional Algorithm.
+/// For the property values, see [Bidirectional Class Values](https://unicode.org/reports/tr44/#Bidi_Class_Values).
+/// For more information, see [Unicode Standard Annex #9](https://unicode.org/reports/tr41/tr41-28.html#UAX9).
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(BidiClassULE)]
+pub struct BidiClass(pub u8); // newtype over the raw property value; named values are associated constants
+
+#[allow(non_upper_case_globals)]
+impl BidiClass {
+    /// (`L`) any strong left-to-right character
+    pub const LeftToRight: Self = Self(0);
+    /// (`R`) any strong right-to-left (non-Arabic-type) character
+    pub const RightToLeft: Self = Self(1);
+    /// (`EN`) any ASCII digit or Eastern Arabic-Indic digit
+    pub const EuropeanNumber: Self = Self(2);
+    /// (`ES`) plus and minus signs
+    pub const EuropeanSeparator: Self = Self(3);
+    /// (`ET`) a terminator in a numeric format context, includes currency signs
+    pub const EuropeanTerminator: Self = Self(4);
+    /// (`AN`) any Arabic-Indic digit
+    pub const ArabicNumber: Self = Self(5);
+    /// (`CS`) commas, colons, and slashes
+    pub const CommonSeparator: Self = Self(6);
+    /// (`B`) various newline characters
+    pub const ParagraphSeparator: Self = Self(7);
+    /// (`S`) various segment-related control codes
+    pub const SegmentSeparator: Self = Self(8);
+    /// (`WS`) spaces
+    pub const WhiteSpace: Self = Self(9);
+    /// (`ON`) most other symbols and punctuation marks
+    pub const OtherNeutral: Self = Self(10);
+    /// (`LRE`) U+202A: the LR embedding control
+    pub const LeftToRightEmbedding: Self = Self(11);
+    /// (`LRO`) U+202D: the LR override control
+    pub const LeftToRightOverride: Self = Self(12);
+    /// (`AL`) any strong right-to-left (Arabic-type) character
+    pub const ArabicLetter: Self = Self(13);
+    /// (`RLE`) U+202B: the RL embedding control
+    pub const RightToLeftEmbedding: Self = Self(14);
+    /// (`RLO`) U+202E: the RL override control
+    pub const RightToLeftOverride: Self = Self(15);
+    /// (`PDF`) U+202C: terminates an embedding or override control
+    pub const PopDirectionalFormat: Self = Self(16);
+    /// (`NSM`) any nonspacing mark
+    pub const NonspacingMark: Self = Self(17);
+    /// (`BN`) most format characters, control codes, or noncharacters
+    pub const BoundaryNeutral: Self = Self(18);
+    /// (`FSI`) U+2068: the first strong isolate control
+    pub const FirstStrongIsolate: Self = Self(19);
+    /// (`LRI`) U+2066: the LR isolate control
+    pub const LeftToRightIsolate: Self = Self(20);
+    /// (`RLI`) U+2067: the RL isolate control
+    pub const RightToLeftIsolate: Self = Self(21);
+    /// (`PDI`) U+2069: terminates an isolate control
+    pub const PopDirectionalIsolate: Self = Self(22);
+}
+
+impl_value_getter! {
+ markers: BidiClassNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_BC_V1, BidiClassValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_BC_V1, BidiClassValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_BC_V1;
+ impl BidiClass {
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Bidi_Class` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::BidiClass;
+ ///
+ /// let lookup = BidiClass::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("AN"), Some(BidiClass::ArabicNumber));
+ /// assert_eq!(lookup.get_strict("NSM"), Some(BidiClass::NonspacingMark));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Arabic_Number"), Some(BidiClass::ArabicNumber));
+ /// assert_eq!(lookup.get_strict("Nonspacing_Mark"), Some(BidiClass::NonspacingMark));
+ /// // name has incorrect casing
+ /// assert_eq!(lookup.get_strict("arabicnumber"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("arabicnumber"), Some(BidiClass::ArabicNumber));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Upside_Down_Vertical_Backwards_Mirrored"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `Bidi_Class` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::BidiClass;
+ ///
+ /// let lookup = BidiClass::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(BidiClass::ArabicNumber), Some("AN"));
+ /// assert_eq!(lookup.get(BidiClass::NonspacingMark), Some("NSM"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Bidi_Class` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::BidiClass;
+ ///
+ /// let lookup = BidiClass::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(BidiClass::ArabicNumber), Some("Arabic_Number"));
+ /// assert_eq!(lookup.get(BidiClass::NonspacingMark), Some("Nonspacing_Mark"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+
+/// Enumerated property General_Category.
+///
+/// General_Category specifies the most general classification of a code point, usually
+/// determined based on the primary characteristic of the assigned character. For example, is the
+/// character a letter, a mark, a number, punctuation, or a symbol, and if so, of what type?
+///
+/// GeneralCategory only supports specific subcategories (e.g. `UppercaseLetter`).
+/// It does not support grouped categories (e.g. `Letter`). For grouped categories, use [`GeneralCategoryGroup`].
+#[derive(Copy, Clone, PartialEq, Eq, Debug, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_enums)] // this type is stable
+#[zerovec::make_ule(GeneralCategoryULE)]
+#[repr(u8)]
+pub enum GeneralCategory {
+ /// (`Cn`) A reserved unassigned code point or a noncharacter
+ Unassigned = 0,
+
+ /// (`Lu`) An uppercase letter
+ UppercaseLetter = 1,
+ /// (`Ll`) A lowercase letter
+ LowercaseLetter = 2,
+ /// (`Lt`) A digraphic letter, with first part uppercase
+ TitlecaseLetter = 3,
+ /// (`Lm`) A modifier letter
+ ModifierLetter = 4,
+ /// (`Lo`) Other letters, including syllables and ideographs
+ OtherLetter = 5,
+
+ /// (`Mn`) A nonspacing combining mark (zero advance width)
+ NonspacingMark = 6,
+ /// (`Mc`) A spacing combining mark (positive advance width)
+ SpacingMark = 8, // variants are listed in Mn, Mc, Me order, so 8 is declared before 7
+ /// (`Me`) An enclosing combining mark
+ EnclosingMark = 7,
+
+ /// (`Nd`) A decimal digit
+ DecimalNumber = 9,
+ /// (`Nl`) A letterlike numeric character
+ LetterNumber = 10,
+ /// (`No`) A numeric character of other type
+ OtherNumber = 11,
+
+ /// (`Zs`) A space character (of various non-zero widths)
+ SpaceSeparator = 12,
+ /// (`Zl`) U+2028 LINE SEPARATOR only
+ LineSeparator = 13,
+ /// (`Zp`) U+2029 PARAGRAPH SEPARATOR only
+ ParagraphSeparator = 14,
+
+ /// (`Cc`) A C0 or C1 control code
+ Control = 15,
+ /// (`Cf`) A format control character
+ Format = 16,
+ /// (`Co`) A private-use character
+ PrivateUse = 17,
+ /// (`Cs`) A surrogate code point
+ Surrogate = 18,
+
+ /// (`Pd`) A dash or hyphen punctuation mark
+ DashPunctuation = 19,
+ /// (`Ps`) An opening punctuation mark (of a pair)
+ OpenPunctuation = 20,
+ /// (`Pe`) A closing punctuation mark (of a pair)
+ ClosePunctuation = 21,
+ /// (`Pc`) A connecting punctuation mark, like a tie
+ ConnectorPunctuation = 22,
+ /// (`Pi`) An initial quotation mark
+ InitialPunctuation = 28, // Pi and Pf hold the two highest values (28, 29); Po below keeps 23
+ /// (`Pf`) A final quotation mark
+ FinalPunctuation = 29,
+ /// (`Po`) A punctuation mark of other type
+ OtherPunctuation = 23,
+
+ /// (`Sm`) A symbol of mathematical use
+ MathSymbol = 24,
+ /// (`Sc`) A currency sign
+ CurrencySymbol = 25,
+ /// (`Sk`) A non-letterlike modifier symbol
+ ModifierSymbol = 26,
+ /// (`So`) A symbol of other type
+ OtherSymbol = 27,
+}
+
+impl_value_getter! {
+ markers: GeneralCategoryNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_GC_V1, GeneralCategoryValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_GC_V1, GeneralCategoryValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_GC_V1;
+ impl GeneralCategory {
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `General_Category` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("Lu"), Some(GeneralCategory::UppercaseLetter));
+ /// assert_eq!(lookup.get_strict("Pd"), Some(GeneralCategory::DashPunctuation));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Uppercase_Letter"), Some(GeneralCategory::UppercaseLetter));
+ /// assert_eq!(lookup.get_strict("Dash_Punctuation"), Some(GeneralCategory::DashPunctuation));
+ /// // name has incorrect casing
+ /// assert_eq!(lookup.get_strict("dashpunctuation"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("dash-punctuation"), Some(GeneralCategory::DashPunctuation));
+ /// // fake property
+ /// assert_eq!(lookup.get_loose("Animated_Gif"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper(); // provider-based getter / compiled-data getter
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `General_Category` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(GeneralCategory::UppercaseLetter), Some("Lu"));
+ /// assert_eq!(lookup.get(GeneralCategory::DashPunctuation), Some("Pd"));
+ /// assert_eq!(lookup.get(GeneralCategory::FinalPunctuation), Some("Pf"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; // owned mapper type / borrowed mapper type
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `General_Category` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GeneralCategory;
+ ///
+ /// let lookup = GeneralCategory::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(GeneralCategory::UppercaseLetter), Some("Uppercase_Letter"));
+ /// assert_eq!(lookup.get(GeneralCategory::DashPunctuation), Some("Dash_Punctuation"));
+ /// assert_eq!(lookup.get(GeneralCategory::FinalPunctuation), Some("Final_Punctuation"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed; // owned mapper type / borrowed mapper type
+ }
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash, Default)]
+pub struct GeneralCategoryTryFromError;
+
+impl TryFrom<u8> for GeneralCategory {
+ type Error = GeneralCategoryTryFromError;
+ /// Construct this [`GeneralCategory`] from an integer, returning
+ /// an error if it is out of bounds
+ fn try_from(val: u8) -> Result<Self, GeneralCategoryTryFromError> {
+ GeneralCategory::new_from_u8(val).ok_or(GeneralCategoryTryFromError)
+ }
+}
+
+/// Groupings of multiple General_Category property values.
+///
+/// Instances of `GeneralCategoryGroup` represent the defined multi-category
+/// values that are useful for users in certain contexts, such as regex. In
+/// other words, unlike [`GeneralCategory`], this supports groups of general
+/// categories: for example, `Letter` is the union of `UppercaseLetter`,
+/// `LowercaseLetter`, etc.
+///
+/// See <https://www.unicode.org/reports/tr44/>.
+///
+/// The discriminants correspond to the `U_GC_XX_MASK` constants in ICU4C.
+/// Each group is a bit mask in which bit `n` is set for every member
+/// [`GeneralCategory`] whose numeric value is `n`.
+///
+/// See `UCharCategory` and `U_GET_GC_MASK` in ICU4C.
+#[derive(Copy, Clone, PartialEq, Debug, Eq)]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+pub struct GeneralCategoryGroup(pub(crate) u32);
+
+use GeneralCategory as GC;
+use GeneralCategoryGroup as GCG;
+
+#[allow(non_upper_case_globals)]
+impl GeneralCategoryGroup {
+ /// (`Lu`) An uppercase letter
+ pub const UppercaseLetter: GeneralCategoryGroup = GCG(1 << (GC::UppercaseLetter as u32));
+ /// (`Ll`) A lowercase letter
+ pub const LowercaseLetter: GeneralCategoryGroup = GCG(1 << (GC::LowercaseLetter as u32));
+ /// (`Lt`) A digraphic letter, with first part uppercase
+ pub const TitlecaseLetter: GeneralCategoryGroup = GCG(1 << (GC::TitlecaseLetter as u32));
+ /// (`Lm`) A modifier letter
+ pub const ModifierLetter: GeneralCategoryGroup = GCG(1 << (GC::ModifierLetter as u32));
+ /// (`Lo`) Other letters, including syllables and ideographs
+ pub const OtherLetter: GeneralCategoryGroup = GCG(1 << (GC::OtherLetter as u32));
+ /// (`LC`) The union of UppercaseLetter, LowercaseLetter, and TitlecaseLetter
+ pub const CasedLetter: GeneralCategoryGroup = GCG(1 << (GC::UppercaseLetter as u32)
+ | 1 << (GC::LowercaseLetter as u32)
+ | 1 << (GC::TitlecaseLetter as u32));
+ /// (`L`) The union of all letter categories
+ pub const Letter: GeneralCategoryGroup = GCG(1 << (GC::UppercaseLetter as u32)
+ | 1 << (GC::LowercaseLetter as u32)
+ | 1 << (GC::TitlecaseLetter as u32)
+ | 1 << (GC::ModifierLetter as u32)
+ | 1 << (GC::OtherLetter as u32));
+
+ /// (`Mn`) A nonspacing combining mark (zero advance width)
+ pub const NonspacingMark: GeneralCategoryGroup = GCG(1 << (GC::NonspacingMark as u32));
+ /// (`Me`) An enclosing combining mark
+ pub const EnclosingMark: GeneralCategoryGroup = GCG(1 << (GC::EnclosingMark as u32));
+ /// (`Mc`) A spacing combining mark (positive advance width)
+ pub const SpacingMark: GeneralCategoryGroup = GCG(1 << (GC::SpacingMark as u32));
+ /// (`M`) The union of all mark categories
+ pub const Mark: GeneralCategoryGroup = GCG(1 << (GC::NonspacingMark as u32)
+ | 1 << (GC::EnclosingMark as u32)
+ | 1 << (GC::SpacingMark as u32));
+
+ /// (`Nd`) A decimal digit
+ pub const DecimalNumber: GeneralCategoryGroup = GCG(1 << (GC::DecimalNumber as u32));
+ /// (`Nl`) A letterlike numeric character
+ pub const LetterNumber: GeneralCategoryGroup = GCG(1 << (GC::LetterNumber as u32));
+ /// (`No`) A numeric character of other type
+ pub const OtherNumber: GeneralCategoryGroup = GCG(1 << (GC::OtherNumber as u32));
+ /// (`N`) The union of all number categories
+ pub const Number: GeneralCategoryGroup = GCG(1 << (GC::DecimalNumber as u32)
+ | 1 << (GC::LetterNumber as u32)
+ | 1 << (GC::OtherNumber as u32));
+
+ /// (`Zs`) A space character (of various non-zero widths)
+ pub const SpaceSeparator: GeneralCategoryGroup = GCG(1 << (GC::SpaceSeparator as u32));
+ /// (`Zl`) U+2028 LINE SEPARATOR only
+ pub const LineSeparator: GeneralCategoryGroup = GCG(1 << (GC::LineSeparator as u32));
+ /// (`Zp`) U+2029 PARAGRAPH SEPARATOR only
+ pub const ParagraphSeparator: GeneralCategoryGroup = GCG(1 << (GC::ParagraphSeparator as u32));
+ /// (`Z`) The union of all separator categories
+ pub const Separator: GeneralCategoryGroup = GCG(1 << (GC::SpaceSeparator as u32)
+ | 1 << (GC::LineSeparator as u32)
+ | 1 << (GC::ParagraphSeparator as u32));
+
+ /// (`Cc`) A C0 or C1 control code
+ pub const Control: GeneralCategoryGroup = GCG(1 << (GC::Control as u32));
+ /// (`Cf`) A format control character
+ pub const Format: GeneralCategoryGroup = GCG(1 << (GC::Format as u32));
+ /// (`Co`) A private-use character
+ pub const PrivateUse: GeneralCategoryGroup = GCG(1 << (GC::PrivateUse as u32));
+ /// (`Cs`) A surrogate code point
+ pub const Surrogate: GeneralCategoryGroup = GCG(1 << (GC::Surrogate as u32));
+ /// (`Cn`) A reserved unassigned code point or a noncharacter
+ pub const Unassigned: GeneralCategoryGroup = GCG(1 << (GC::Unassigned as u32));
+ /// (`C`) The union of all control code, reserved, and unassigned categories
+ pub const Other: GeneralCategoryGroup = GCG(1 << (GC::Control as u32)
+ | 1 << (GC::Format as u32)
+ | 1 << (GC::PrivateUse as u32)
+ | 1 << (GC::Surrogate as u32)
+ | 1 << (GC::Unassigned as u32));
+
+ /// (`Pd`) A dash or hyphen punctuation mark
+ pub const DashPunctuation: GeneralCategoryGroup = GCG(1 << (GC::DashPunctuation as u32));
+ /// (`Ps`) An opening punctuation mark (of a pair)
+ pub const OpenPunctuation: GeneralCategoryGroup = GCG(1 << (GC::OpenPunctuation as u32));
+ /// (`Pe`) A closing punctuation mark (of a pair)
+ pub const ClosePunctuation: GeneralCategoryGroup = GCG(1 << (GC::ClosePunctuation as u32));
+ /// (`Pc`) A connecting punctuation mark, like a tie
+ pub const ConnectorPunctuation: GeneralCategoryGroup =
+ GCG(1 << (GC::ConnectorPunctuation as u32));
+ /// (`Pi`) An initial quotation mark
+ pub const InitialPunctuation: GeneralCategoryGroup = GCG(1 << (GC::InitialPunctuation as u32));
+ /// (`Pf`) A final quotation mark
+ pub const FinalPunctuation: GeneralCategoryGroup = GCG(1 << (GC::FinalPunctuation as u32));
+ /// (`Po`) A punctuation mark of other type
+ pub const OtherPunctuation: GeneralCategoryGroup = GCG(1 << (GC::OtherPunctuation as u32));
+ /// (`P`) The union of all punctuation categories
+ pub const Punctuation: GeneralCategoryGroup = GCG(1 << (GC::DashPunctuation as u32)
+ | 1 << (GC::OpenPunctuation as u32)
+ | 1 << (GC::ClosePunctuation as u32)
+ | 1 << (GC::ConnectorPunctuation as u32)
+ | 1 << (GC::OtherPunctuation as u32)
+ | 1 << (GC::InitialPunctuation as u32)
+ | 1 << (GC::FinalPunctuation as u32));
+
+ /// (`Sm`) A symbol of mathematical use
+ pub const MathSymbol: GeneralCategoryGroup = GCG(1 << (GC::MathSymbol as u32));
+ /// (`Sc`) A currency sign
+ pub const CurrencySymbol: GeneralCategoryGroup = GCG(1 << (GC::CurrencySymbol as u32));
+ /// (`Sk`) A non-letterlike modifier symbol
+ pub const ModifierSymbol: GeneralCategoryGroup = GCG(1 << (GC::ModifierSymbol as u32));
+ /// (`So`) A symbol of other type
+ pub const OtherSymbol: GeneralCategoryGroup = GCG(1 << (GC::OtherSymbol as u32));
+ /// (`S`) The union of all symbol categories
+ pub const Symbol: GeneralCategoryGroup = GCG(1 << (GC::MathSymbol as u32)
+ | 1 << (GC::CurrencySymbol as u32)
+ | 1 << (GC::ModifierSymbol as u32)
+ | 1 << (GC::OtherSymbol as u32));
+
+ const ALL: u32 = (1 << (GC::FinalPunctuation as u32 + 1)) - 1; // low 30 bits set: one per `GeneralCategory` value, `FinalPunctuation` (29) being the highest
+
+ /// Test whether a single [`GeneralCategory`] value is a member of this group.
+ ///
+ /// The check is a bit test: the category's numeric value selects one bit of the
+ /// group's mask, and the result is `true` when that bit is set.
+ ///
+ /// ```
+ /// use icu::properties::{maps, GeneralCategory, GeneralCategoryGroup};
+ /// use icu_collections::codepointtrie::CodePointTrie;
+ ///
+ /// let gc = maps::general_category();
+ ///
+ /// assert_eq!(gc.get('A'), GeneralCategory::UppercaseLetter);
+ /// assert!(GeneralCategoryGroup::CasedLetter.contains(gc.get('A')));
+ ///
+ /// // U+0B1E ORIYA LETTER NYA
+ /// assert_eq!(gc.get('ଞ'), GeneralCategory::OtherLetter);
+ /// assert!(GeneralCategoryGroup::Letter.contains(gc.get('ଞ')));
+ /// assert!(!GeneralCategoryGroup::CasedLetter.contains(gc.get('ଞ')));
+ ///
+ /// // U+0301 COMBINING ACUTE ACCENT
+ /// assert_eq!(gc.get32(0x0301), GeneralCategory::NonspacingMark);
+ /// assert!(GeneralCategoryGroup::Mark.contains(gc.get32(0x0301)));
+ /// assert!(!GeneralCategoryGroup::Letter.contains(gc.get32(0x0301)));
+ ///
+ /// assert_eq!(gc.get('0'), GeneralCategory::DecimalNumber);
+ /// assert!(GeneralCategoryGroup::Number.contains(gc.get('0')));
+ /// assert!(!GeneralCategoryGroup::Mark.contains(gc.get('0')));
+ ///
+ /// assert_eq!(gc.get('('), GeneralCategory::OpenPunctuation);
+ /// assert!(GeneralCategoryGroup::Punctuation.contains(gc.get('(')));
+ /// assert!(!GeneralCategoryGroup::Number.contains(gc.get('(')));
+ ///
+ /// // U+2713 CHECK MARK
+ /// assert_eq!(gc.get('✓'), GeneralCategory::OtherSymbol);
+ /// assert!(GeneralCategoryGroup::Symbol.contains(gc.get('✓')));
+ /// assert!(!GeneralCategoryGroup::Punctuation.contains(gc.get('✓')));
+ ///
+ /// assert_eq!(gc.get(' '), GeneralCategory::SpaceSeparator);
+ /// assert!(GeneralCategoryGroup::Separator.contains(gc.get(' ')));
+ /// assert!(!GeneralCategoryGroup::Symbol.contains(gc.get(' ')));
+ ///
+ /// // U+E007F CANCEL TAG
+ /// assert_eq!(gc.get32(0xE007F), GeneralCategory::Format);
+ /// assert!(GeneralCategoryGroup::Other.contains(gc.get32(0xE007F)));
+ /// assert!(!GeneralCategoryGroup::Separator.contains(gc.get32(0xE007F)));
+ /// ```
+ pub const fn contains(&self, val: GeneralCategory) -> bool {
+     // Single-bit mask for `val`, positioned by its numeric value.
+     let category_bit: u32 = 1 << (val as u32);
+     (self.0 & category_bit) != 0
+ }
+
+ /// Build the group made of exactly those categories that are absent from this one
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// use icu::properties::{GeneralCategory, GeneralCategoryGroup};
+ ///
+ /// let letter = GeneralCategoryGroup::Letter;
+ /// let not_letter = letter.complement();
+ ///
+ /// assert!(not_letter.contains(GeneralCategory::MathSymbol));
+ /// assert!(!letter.contains(GeneralCategory::MathSymbol));
+ /// assert!(not_letter.contains(GeneralCategory::OtherPunctuation));
+ /// assert!(!letter.contains(GeneralCategory::OtherPunctuation));
+ /// assert!(!not_letter.contains(GeneralCategory::UppercaseLetter));
+ /// assert!(letter.contains(GeneralCategory::UppercaseLetter));
+ /// ```
+ pub const fn complement(self) -> Self {
+ let flipped = !self.0;
+ // Negation can set bits that are not part of Self::ALL; intersect
+ // with it so the resulting mask stays in-range.
+ Self(flipped & Self::ALL)
+ }
+
+ /// Build the group that contains every GeneralCategory value
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// use icu::properties::{GeneralCategory, GeneralCategoryGroup};
+ ///
+ /// let all = GeneralCategoryGroup::all();
+ ///
+ /// assert!(all.contains(GeneralCategory::MathSymbol));
+ /// assert!(all.contains(GeneralCategory::OtherPunctuation));
+ /// assert!(all.contains(GeneralCategory::UppercaseLetter));
+ /// ```
+ pub const fn all() -> Self {
+ let every_bit = Self::ALL;
+ GeneralCategoryGroup(every_bit)
+ }
+
+ /// Build the group that contains no GeneralCategory value at all
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// use icu::properties::{GeneralCategory, GeneralCategoryGroup};
+ ///
+ /// let empty = GeneralCategoryGroup::empty();
+ ///
+ /// assert!(!empty.contains(GeneralCategory::MathSymbol));
+ /// assert!(!empty.contains(GeneralCategory::OtherPunctuation));
+ /// assert!(!empty.contains(GeneralCategory::UppercaseLetter));
+ /// ```
+ pub const fn empty() -> Self {
+ // A mask with no bit set has no members.
+ GeneralCategoryGroup(0)
+ }
+
+ /// Combine two groups into one containing the members of either
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// use icu::properties::{GeneralCategory, GeneralCategoryGroup};
+ ///
+ /// let letter = GeneralCategoryGroup::Letter;
+ /// let symbol = GeneralCategoryGroup::Symbol;
+ /// let union = letter.union(symbol);
+ ///
+ /// assert!(union.contains(GeneralCategory::MathSymbol));
+ /// assert!(!union.contains(GeneralCategory::OtherPunctuation));
+ /// assert!(union.contains(GeneralCategory::UppercaseLetter));
+ /// ```
+ pub const fn union(self, other: Self) -> Self {
+ let either = self.0 | other.0;
+ GeneralCategoryGroup(either)
+ }
+
+ /// Reduce two groups to the members they have in common
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// use icu::properties::{GeneralCategory, GeneralCategoryGroup};
+ ///
+ /// let letter = GeneralCategoryGroup::Letter;
+ /// let lu = GeneralCategoryGroup::UppercaseLetter;
+ /// let intersection = letter.intersection(lu);
+ ///
+ /// assert!(!intersection.contains(GeneralCategory::MathSymbol));
+ /// assert!(!intersection.contains(GeneralCategory::OtherPunctuation));
+ /// assert!(intersection.contains(GeneralCategory::UppercaseLetter));
+ /// assert!(!intersection.contains(GeneralCategory::LowercaseLetter));
+ /// ```
+ pub const fn intersection(self, other: Self) -> Self {
+ let shared = self.0 & other.0;
+ GeneralCategoryGroup(shared)
+ }
+}
+
+// Only a name-to-value lookup is declared for `GeneralCategoryGroup`: the
+// macro receives a single marker and a single `pub fn` entry.
+impl_value_getter! {
+ markers: GeneralCategoryMaskNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_GCM_V1;
+ impl GeneralCategoryGroup {
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `General_Category_Mask` mask property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GeneralCategoryGroup;
+ ///
+ /// let lookup = GeneralCategoryGroup::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("L"), Some(GeneralCategoryGroup::Letter));
+ /// assert_eq!(lookup.get_strict("LC"), Some(GeneralCategoryGroup::CasedLetter));
+ /// assert_eq!(lookup.get_strict("Lu"), Some(GeneralCategoryGroup::UppercaseLetter));
+ /// assert_eq!(lookup.get_strict("Zp"), Some(GeneralCategoryGroup::ParagraphSeparator));
+ /// assert_eq!(lookup.get_strict("P"), Some(GeneralCategoryGroup::Punctuation));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Letter"), Some(GeneralCategoryGroup::Letter));
+ /// assert_eq!(lookup.get_strict("Cased_Letter"), Some(GeneralCategoryGroup::CasedLetter));
+ /// assert_eq!(lookup.get_strict("Uppercase_Letter"), Some(GeneralCategoryGroup::UppercaseLetter));
+ /// // alias name
+ /// assert_eq!(lookup.get_strict("punct"), Some(GeneralCategoryGroup::Punctuation));
+ /// // name has incorrect casing
+ /// assert_eq!(lookup.get_strict("letter"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("letter"), Some(GeneralCategoryGroup::Letter));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("EverythingLol"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ }
+}
+
+/// Converts a single category into the group whose only member is that category.
+impl From<GeneralCategory> for GeneralCategoryGroup {
+ fn from(subcategory: GeneralCategory) -> Self {
+ // The category's numeric value is the index of its bit in the mask.
+ let bit_index = subcategory as u32;
+ Self(1 << bit_index)
+ }
+}
+/// Builds a group from a raw mask, discarding any bit that is not in `Self::ALL`.
+impl From<u32> for GeneralCategoryGroup {
+ fn from(mask: u32) -> Self {
+ // Keep only bits present in Self::ALL so the stored mask is
+ // guaranteed to stay in-range.
+ let in_range = mask & Self::ALL;
+ Self(in_range)
+ }
+}
+/// Exposes the raw bit mask stored in a group.
+impl From<GeneralCategoryGroup> for u32 {
+ fn from(group: GeneralCategoryGroup) -> Self {
+ let GeneralCategoryGroup(bits) = group;
+ bits
+ }
+}
+/// Enumerated property Script.
+///
+/// This is used with both the Script and Script_Extensions Unicode properties.
+/// Each character is assigned a single Script, but characters that are used in
+/// a particular subset of scripts will be in more than one Script_Extensions set.
+/// For example, DEVANAGARI DIGIT NINE has Script=Devanagari, but is also in the
+/// Script_Extensions set for Dogra, Kaithi, and Mahajani.
+///
+/// For more information, see UAX #24: <https://www.unicode.org/reports/tr24/>.
+/// See `UScriptCode` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(ScriptULE)]
+pub struct Script(pub u16);
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl Script {
+ // The constants are listed alphabetically by name, so the numeric values
+ // do not appear in ascending order.
+ pub const Adlam: Script = Script(167);
+ pub const Ahom: Script = Script(161);
+ pub const AnatolianHieroglyphs: Script = Script(156);
+ pub const Arabic: Script = Script(2);
+ pub const Armenian: Script = Script(3);
+ pub const Avestan: Script = Script(117);
+ pub const Balinese: Script = Script(62);
+ pub const Bamum: Script = Script(130);
+ pub const BassaVah: Script = Script(134);
+ pub const Batak: Script = Script(63);
+ pub const Bengali: Script = Script(4);
+ pub const Bhaiksuki: Script = Script(168);
+ pub const Bopomofo: Script = Script(5);
+ pub const Brahmi: Script = Script(65);
+ pub const Braille: Script = Script(46);
+ pub const Buginese: Script = Script(55);
+ pub const Buhid: Script = Script(44);
+ pub const CanadianAboriginal: Script = Script(40);
+ pub const Carian: Script = Script(104);
+ pub const CaucasianAlbanian: Script = Script(159);
+ pub const Chakma: Script = Script(118);
+ pub const Cham: Script = Script(66);
+ pub const Cherokee: Script = Script(6);
+ pub const Chorasmian: Script = Script(189);
+ pub const Common: Script = Script(0);
+ pub const Coptic: Script = Script(7);
+ pub const Cuneiform: Script = Script(101);
+ pub const Cypriot: Script = Script(47);
+ pub const CyproMinoan: Script = Script(193);
+ pub const Cyrillic: Script = Script(8);
+ pub const Deseret: Script = Script(9);
+ pub const Devanagari: Script = Script(10);
+ pub const DivesAkuru: Script = Script(190);
+ pub const Dogra: Script = Script(178);
+ pub const Duployan: Script = Script(135);
+ pub const EgyptianHieroglyphs: Script = Script(71);
+ pub const Elbasan: Script = Script(136);
+ pub const Elymaic: Script = Script(185);
+ pub const Ethiopian: Script = Script(11);
+ pub const Georgian: Script = Script(12);
+ pub const Glagolitic: Script = Script(56);
+ pub const Gothic: Script = Script(13);
+ pub const Grantha: Script = Script(137);
+ pub const Greek: Script = Script(14);
+ pub const Gujarati: Script = Script(15);
+ pub const GunjalaGondi: Script = Script(179);
+ pub const Gurmukhi: Script = Script(16);
+ pub const Han: Script = Script(17);
+ pub const Hangul: Script = Script(18);
+ pub const HanifiRohingya: Script = Script(182);
+ pub const Hanunoo: Script = Script(43);
+ pub const Hatran: Script = Script(162);
+ pub const Hebrew: Script = Script(19);
+ pub const Hiragana: Script = Script(20);
+ pub const ImperialAramaic: Script = Script(116);
+ pub const Inherited: Script = Script(1);
+ pub const InscriptionalPahlavi: Script = Script(122);
+ pub const InscriptionalParthian: Script = Script(125);
+ pub const Javanese: Script = Script(78);
+ pub const Kaithi: Script = Script(120);
+ pub const Kannada: Script = Script(21);
+ pub const Katakana: Script = Script(22);
+ pub const Kawi: Script = Script(198);
+ pub const KayahLi: Script = Script(79);
+ pub const Kharoshthi: Script = Script(57);
+ pub const KhitanSmallScript: Script = Script(191);
+ pub const Khmer: Script = Script(23);
+ pub const Khojki: Script = Script(157);
+ pub const Khudawadi: Script = Script(145);
+ pub const Lao: Script = Script(24);
+ pub const Latin: Script = Script(25);
+ pub const Lepcha: Script = Script(82);
+ pub const Limbu: Script = Script(48);
+ pub const LinearA: Script = Script(83);
+ pub const LinearB: Script = Script(49);
+ pub const Lisu: Script = Script(131);
+ pub const Lycian: Script = Script(107);
+ pub const Lydian: Script = Script(108);
+ pub const Mahajani: Script = Script(160);
+ pub const Makasar: Script = Script(180);
+ pub const Malayalam: Script = Script(26);
+ pub const Mandaic: Script = Script(84);
+ pub const Manichaean: Script = Script(121);
+ pub const Marchen: Script = Script(169);
+ pub const MasaramGondi: Script = Script(175);
+ pub const Medefaidrin: Script = Script(181);
+ pub const MeeteiMayek: Script = Script(115);
+ pub const MendeKikakui: Script = Script(140);
+ pub const MeroiticCursive: Script = Script(141);
+ pub const MeroiticHieroglyphs: Script = Script(86);
+ pub const Miao: Script = Script(92);
+ pub const Modi: Script = Script(163);
+ pub const Mongolian: Script = Script(27);
+ pub const Mro: Script = Script(149);
+ pub const Multani: Script = Script(164);
+ pub const Myanmar: Script = Script(28);
+ pub const Nabataean: Script = Script(143);
+ pub const NagMundari: Script = Script(199);
+ pub const Nandinagari: Script = Script(187);
+ pub const NewTaiLue: Script = Script(59);
+ pub const Newa: Script = Script(170);
+ pub const Nko: Script = Script(87);
+ pub const Nushu: Script = Script(150);
+ pub const NyiakengPuachueHmong: Script = Script(186);
+ pub const Ogham: Script = Script(29);
+ pub const OlChiki: Script = Script(109);
+ pub const OldHungarian: Script = Script(76);
+ pub const OldItalic: Script = Script(30);
+ pub const OldNorthArabian: Script = Script(142);
+ pub const OldPermic: Script = Script(89);
+ pub const OldPersian: Script = Script(61);
+ pub const OldSogdian: Script = Script(184);
+ pub const OldSouthArabian: Script = Script(133);
+ pub const OldTurkic: Script = Script(88);
+ pub const OldUyghur: Script = Script(194);
+ pub const Oriya: Script = Script(31);
+ pub const Osage: Script = Script(171);
+ pub const Osmanya: Script = Script(50);
+ pub const PahawhHmong: Script = Script(75);
+ pub const Palmyrene: Script = Script(144);
+ pub const PauCinHau: Script = Script(165);
+ pub const PhagsPa: Script = Script(90);
+ pub const Phoenician: Script = Script(91);
+ pub const PsalterPahlavi: Script = Script(123);
+ pub const Rejang: Script = Script(110);
+ pub const Runic: Script = Script(32);
+ pub const Samaritan: Script = Script(126);
+ pub const Saurashtra: Script = Script(111);
+ pub const Sharada: Script = Script(151);
+ pub const Shavian: Script = Script(51);
+ pub const Siddham: Script = Script(166);
+ pub const SignWriting: Script = Script(112);
+ pub const Sinhala: Script = Script(33);
+ pub const Sogdian: Script = Script(183);
+ pub const SoraSompeng: Script = Script(152);
+ pub const Soyombo: Script = Script(176);
+ pub const Sundanese: Script = Script(113);
+ pub const SylotiNagri: Script = Script(58);
+ pub const Syriac: Script = Script(34);
+ pub const Tagalog: Script = Script(42);
+ pub const Tagbanwa: Script = Script(45);
+ pub const TaiLe: Script = Script(52);
+ pub const TaiTham: Script = Script(106);
+ pub const TaiViet: Script = Script(127);
+ pub const Takri: Script = Script(153);
+ pub const Tamil: Script = Script(35);
+ pub const Tangsa: Script = Script(195);
+ pub const Tangut: Script = Script(154);
+ pub const Telugu: Script = Script(36);
+ pub const Thaana: Script = Script(37);
+ pub const Thai: Script = Script(38);
+ pub const Tibetan: Script = Script(39);
+ pub const Tifinagh: Script = Script(60);
+ pub const Tirhuta: Script = Script(158);
+ pub const Toto: Script = Script(196);
+ pub const Ugaritic: Script = Script(53);
+ pub const Unknown: Script = Script(103);
+ pub const Vai: Script = Script(99);
+ pub const Vithkuqi: Script = Script(197);
+ pub const Wancho: Script = Script(188);
+ pub const WarangCiti: Script = Script(146);
+ pub const Yezidi: Script = Script(192);
+ pub const Yi: Script = Script(41);
+ pub const ZanabazarSquare: Script = Script(177);
+}
+
+impl_value_getter! {
+ markers: ScriptNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_SC_V1, ScriptValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR4_SC_V1, ScriptValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_SC_V1;
+ impl Script {
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Script` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::Script;
+ ///
+ /// let lookup = Script::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("Brah"), Some(Script::Brahmi));
+ /// assert_eq!(lookup.get_strict("Hang"), Some(Script::Hangul));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Brahmi"), Some(Script::Brahmi));
+ /// assert_eq!(lookup.get_strict("Hangul"), Some(Script::Hangul));
+ /// // name has incorrect casing
+ /// assert_eq!(lookup.get_strict("brahmi"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("brahmi"), Some(Script::Brahmi));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Linear_Z"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearTiny4Mapper`], capable of looking up short names
+ /// for values of the `Script` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::Script;
+ /// use tinystr::tinystr;
+ ///
+ /// let lookup = Script::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(Script::Brahmi), Some(tinystr!(4, "Brah")));
+ /// assert_eq!(lookup.get(Script::Hangul), Some(tinystr!(4, "Hang")));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearTiny4Mapper / PropertyEnumToValueNameLinearTiny4MapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Script` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::Script;
+ ///
+ /// let lookup = Script::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(Script::Brahmi), Some("Brahmi"));
+ /// assert_eq!(lookup.get(Script::Hangul), Some("Hangul"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+
+/// Enumerated property East_Asian_Width.
+///
+/// See "Definition" in UAX #11 for the summary of each property value:
+/// <https://www.unicode.org/reports/tr11/#Definitions>
+///
+/// The numeric value is compatible with `UEastAsianWidth` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(EastAsianWidthULE)]
+pub struct EastAsianWidth(pub u8);
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl EastAsianWidth {
+ // NOTE(review): the trailing `name="..."` comments appear to record the
+ // short property value alias of each constant (the name-lookup example
+ // for this type resolves "N" and "H" to `Neutral` and `Halfwidth`).
+ pub const Neutral: EastAsianWidth = EastAsianWidth(0); // name="N"
+ pub const Ambiguous: EastAsianWidth = EastAsianWidth(1); // name="A"
+ pub const Halfwidth: EastAsianWidth = EastAsianWidth(2); // name="H"
+ pub const Fullwidth: EastAsianWidth = EastAsianWidth(3); // name="F"
+ pub const Narrow: EastAsianWidth = EastAsianWidth(4); // name="Na"
+ pub const Wide: EastAsianWidth = EastAsianWidth(5); // name="W"
+}
+
+// Declares three lookups for `EastAsianWidth`, one per marker: name to value,
+// value to short name, and value to long name.
+impl_value_getter! {
+ markers: EastAsianWidthNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_EA_V1, EastAsianWidthValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_EA_V1, EastAsianWidthValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_EA_V1;
+ impl EastAsianWidth {
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `East_Asian_Width` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::EastAsianWidth;
+ ///
+ /// let lookup = EastAsianWidth::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("N"), Some(EastAsianWidth::Neutral));
+ /// assert_eq!(lookup.get_strict("H"), Some(EastAsianWidth::Halfwidth));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Neutral"), Some(EastAsianWidth::Neutral));
+ /// assert_eq!(lookup.get_strict("Halfwidth"), Some(EastAsianWidth::Halfwidth));
+ /// // name has incorrect casing / extra hyphen
+ /// assert_eq!(lookup.get_strict("half-width"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("half-width"), Some(EastAsianWidth::Halfwidth));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("TwoPointFiveWidth"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `East_Asian_Width` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::EastAsianWidth;
+ ///
+ /// let lookup = EastAsianWidth::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(EastAsianWidth::Neutral), Some("N"));
+ /// assert_eq!(lookup.get(EastAsianWidth::Halfwidth), Some("H"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `East_Asian_Width` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::EastAsianWidth;
+ ///
+ /// let lookup = EastAsianWidth::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(EastAsianWidth::Neutral), Some("Neutral"));
+ /// assert_eq!(lookup.get(EastAsianWidth::Halfwidth), Some("Halfwidth"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+
+/// Enumerated property Line_Break.
+///
+/// See "Line Breaking Properties" in UAX #14 for the summary of each property
+/// value: <https://www.unicode.org/reports/tr14/#Properties>
+///
+/// The numeric value is compatible with `ULineBreak` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(LineBreakULE)]
+pub struct LineBreak(pub u8);
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl LineBreak {
+ // NOTE(review): the trailing `name="..."` comments appear to record the
+ // short property value alias of each constant (the name-lookup example
+ // for this type resolves "BK" and "AL" to `MandatoryBreak` and
+ // `Alphabetic`).
+ pub const Unknown: LineBreak = LineBreak(0); // name="XX"
+ pub const Ambiguous: LineBreak = LineBreak(1); // name="AI"
+ pub const Alphabetic: LineBreak = LineBreak(2); // name="AL"
+ pub const BreakBoth: LineBreak = LineBreak(3); // name="B2"
+ pub const BreakAfter: LineBreak = LineBreak(4); // name="BA"
+ pub const BreakBefore: LineBreak = LineBreak(5); // name="BB"
+ pub const MandatoryBreak: LineBreak = LineBreak(6); // name="BK"
+ pub const ContingentBreak: LineBreak = LineBreak(7); // name="CB"
+ pub const ClosePunctuation: LineBreak = LineBreak(8); // name="CL"
+ pub const CombiningMark: LineBreak = LineBreak(9); // name="CM"
+ pub const CarriageReturn: LineBreak = LineBreak(10); // name="CR"
+ pub const Exclamation: LineBreak = LineBreak(11); // name="EX"
+ pub const Glue: LineBreak = LineBreak(12); // name="GL"
+ pub const Hyphen: LineBreak = LineBreak(13); // name="HY"
+ pub const Ideographic: LineBreak = LineBreak(14); // name="ID"
+ pub const Inseparable: LineBreak = LineBreak(15); // name="IN"
+ pub const InfixNumeric: LineBreak = LineBreak(16); // name="IS"
+ pub const LineFeed: LineBreak = LineBreak(17); // name="LF"
+ pub const Nonstarter: LineBreak = LineBreak(18); // name="NS"
+ pub const Numeric: LineBreak = LineBreak(19); // name="NU"
+ pub const OpenPunctuation: LineBreak = LineBreak(20); // name="OP"
+ pub const PostfixNumeric: LineBreak = LineBreak(21); // name="PO"
+ pub const PrefixNumeric: LineBreak = LineBreak(22); // name="PR"
+ pub const Quotation: LineBreak = LineBreak(23); // name="QU"
+ pub const ComplexContext: LineBreak = LineBreak(24); // name="SA"
+ pub const Surrogate: LineBreak = LineBreak(25); // name="SG"
+ pub const Space: LineBreak = LineBreak(26); // name="SP"
+ pub const BreakSymbols: LineBreak = LineBreak(27); // name="SY"
+ pub const ZWSpace: LineBreak = LineBreak(28); // name="ZW"
+ pub const NextLine: LineBreak = LineBreak(29); // name="NL"
+ pub const WordJoiner: LineBreak = LineBreak(30); // name="WJ"
+ pub const H2: LineBreak = LineBreak(31); // name="H2"
+ pub const H3: LineBreak = LineBreak(32); // name="H3"
+ pub const JL: LineBreak = LineBreak(33); // name="JL"
+ pub const JT: LineBreak = LineBreak(34); // name="JT"
+ pub const JV: LineBreak = LineBreak(35); // name="JV"
+ pub const CloseParenthesis: LineBreak = LineBreak(36); // name="CP"
+ pub const ConditionalJapaneseStarter: LineBreak = LineBreak(37); // name="CJ"
+ pub const HebrewLetter: LineBreak = LineBreak(38); // name="HL"
+ pub const RegionalIndicator: LineBreak = LineBreak(39); // name="RI"
+ pub const EBase: LineBreak = LineBreak(40); // name="EB"
+ pub const EModifier: LineBreak = LineBreak(41); // name="EM"
+ pub const ZWJ: LineBreak = LineBreak(42); // name="ZWJ"
+
+ // Added in ICU 74:
+ pub const Aksara: LineBreak = LineBreak(43); // name="AK"
+ pub const AksaraPrebase: LineBreak = LineBreak(44); // name="AP"
+ pub const AksaraStart: LineBreak = LineBreak(45); // name="AS"
+ pub const ViramaFinal: LineBreak = LineBreak(46); // name="VF"
+ pub const Virama: LineBreak = LineBreak(47); // name="VI"
+}
+
+// Declares three lookups for `LineBreak`, one per marker: name to value,
+// value to short name, and value to long name.
+impl_value_getter! {
+ markers: LineBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_LB_V1, LineBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_LB_V1, LineBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_LB_V1;
+ impl LineBreak {
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Line_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::LineBreak;
+ ///
+ /// let lookup = LineBreak::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("BK"), Some(LineBreak::MandatoryBreak));
+ /// assert_eq!(lookup.get_strict("AL"), Some(LineBreak::Alphabetic));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Mandatory_Break"), Some(LineBreak::MandatoryBreak));
+ /// assert_eq!(lookup.get_strict("Alphabetic"), Some(LineBreak::Alphabetic));
+ /// // name has incorrect casing and dash instead of underscore
+ /// assert_eq!(lookup.get_strict("mandatory-Break"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("mandatory-Break"), Some(LineBreak::MandatoryBreak));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Stochastic_Break"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `Line_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::LineBreak;
+ ///
+ /// let lookup = LineBreak::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(LineBreak::MandatoryBreak), Some("BK"));
+ /// assert_eq!(lookup.get(LineBreak::Alphabetic), Some("AL"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Line_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::LineBreak;
+ ///
+ /// let lookup = LineBreak::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(LineBreak::MandatoryBreak), Some("Mandatory_Break"));
+ /// assert_eq!(lookup.get(LineBreak::Alphabetic), Some("Alphabetic"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+
+/// Enumerated property Grapheme_Cluster_Break.
+///
+/// See "Default Grapheme Cluster Boundary Specification" in UAX #29 for the
+/// summary of each property value:
+/// <https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table>
+///
+/// The numeric value is compatible with `UGraphemeClusterBreak` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // this type is stable
+#[repr(transparent)]
+#[zerovec::make_ule(GraphemeClusterBreakULE)]
+pub struct GraphemeClusterBreak(pub u8);
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl GraphemeClusterBreak {
+ // NOTE(review): the trailing `name="..."` comments appear to record the
+ // short property value alias of each constant (the name-lookup example
+ // for this type resolves "EX" and "RI" to `Extend` and
+ // `RegionalIndicator`).
+ pub const Other: GraphemeClusterBreak = GraphemeClusterBreak(0); // name="XX"
+ pub const Control: GraphemeClusterBreak = GraphemeClusterBreak(1); // name="CN"
+ pub const CR: GraphemeClusterBreak = GraphemeClusterBreak(2); // name="CR"
+ pub const Extend: GraphemeClusterBreak = GraphemeClusterBreak(3); // name="EX"
+ pub const L: GraphemeClusterBreak = GraphemeClusterBreak(4); // name="L"
+ pub const LF: GraphemeClusterBreak = GraphemeClusterBreak(5); // name="LF"
+ pub const LV: GraphemeClusterBreak = GraphemeClusterBreak(6); // name="LV"
+ pub const LVT: GraphemeClusterBreak = GraphemeClusterBreak(7); // name="LVT"
+ pub const T: GraphemeClusterBreak = GraphemeClusterBreak(8); // name="T"
+ pub const V: GraphemeClusterBreak = GraphemeClusterBreak(9); // name="V"
+ pub const SpacingMark: GraphemeClusterBreak = GraphemeClusterBreak(10); // name="SM"
+ pub const Prepend: GraphemeClusterBreak = GraphemeClusterBreak(11); // name="PP"
+ pub const RegionalIndicator: GraphemeClusterBreak = GraphemeClusterBreak(12); // name="RI"
+ /// This value is obsolete and unused.
+ pub const EBase: GraphemeClusterBreak = GraphemeClusterBreak(13); // name="EB"
+ /// This value is obsolete and unused.
+ pub const EBaseGAZ: GraphemeClusterBreak = GraphemeClusterBreak(14); // name="EBG"
+ /// This value is obsolete and unused.
+ pub const EModifier: GraphemeClusterBreak = GraphemeClusterBreak(15); // name="EM"
+ /// This value is obsolete and unused.
+ pub const GlueAfterZwj: GraphemeClusterBreak = GraphemeClusterBreak(16); // name="GAZ"
+ pub const ZWJ: GraphemeClusterBreak = GraphemeClusterBreak(17); // name="ZWJ"
+}
+
+impl_value_getter! { // name<->value lookup constructors for `GraphemeClusterBreak`; the doctests below call the second name of each `a() / b()` pair
+ markers: GraphemeClusterBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_GCB_V1, GraphemeClusterBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_GCB_V1, GraphemeClusterBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_GCB_V1;
+ impl GraphemeClusterBreak { // entries in order: name -> value, value -> short name, value -> long name. NOTE(review): the role of the first `get_*` name is set by the macro definition (not visible here) - confirm there
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Grapheme_Cluster_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GraphemeClusterBreak;
+ ///
+ /// let lookup = GraphemeClusterBreak::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("EX"), Some(GraphemeClusterBreak::Extend));
+ /// assert_eq!(lookup.get_strict("RI"), Some(GraphemeClusterBreak::RegionalIndicator));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Extend"), Some(GraphemeClusterBreak::Extend));
+ /// assert_eq!(lookup.get_strict("Regional_Indicator"), Some(GraphemeClusterBreak::RegionalIndicator));
+ /// // name has incorrect casing and lacks an underscore
+ /// assert_eq!(lookup.get_strict("regionalindicator"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("regionalindicator"), Some(GraphemeClusterBreak::RegionalIndicator));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Regional_Indicator_Two_Point_Oh"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `Grapheme_Cluster_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GraphemeClusterBreak;
+ ///
+ /// let lookup = GraphemeClusterBreak::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(GraphemeClusterBreak::Extend), Some("EX"));
+ /// assert_eq!(lookup.get(GraphemeClusterBreak::RegionalIndicator), Some("RI"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Grapheme_Cluster_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::GraphemeClusterBreak;
+ ///
+ /// let lookup = GraphemeClusterBreak::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(GraphemeClusterBreak::Extend), Some("Extend"));
+ /// assert_eq!(lookup.get(GraphemeClusterBreak::RegionalIndicator), Some("Regional_Indicator"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+
+/// Enumerated property Word_Break.
+///
+/// See "Default Word Boundary Specification" in UAX #29 for the summary of
+/// each property value:
+/// <https://www.unicode.org/reports/tr29/#Default_Word_Boundaries>.
+///
+/// The numeric value is compatible with `UWordBreakValues` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(WordBreakULE)]
+pub struct WordBreak(pub u8); // raw numeric value; the known values are exposed as associated constants (e.g. `WordBreak::ALetter`)
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl WordBreak { // one associated constant per Word_Break value; each `name=` note records the short alias
+ pub const Other: Self = Self(0); // name="XX"
+ pub const ALetter: Self = Self(1); // name="LE"
+ pub const Format: Self = Self(2); // name="FO"
+ pub const Katakana: Self = Self(3); // name="KA"
+ pub const MidLetter: Self = Self(4); // name="ML"
+ pub const MidNum: Self = Self(5); // name="MN"
+ pub const Numeric: Self = Self(6); // name="NU"
+ pub const ExtendNumLet: Self = Self(7); // name="EX"
+ pub const CR: Self = Self(8); // name="CR"
+ pub const Extend: Self = Self(9); // name="Extend"
+ pub const LF: Self = Self(10); // name="LF"
+ pub const MidNumLet: Self = Self(11); // name="MB"
+ pub const Newline: Self = Self(12); // name="NL"
+ pub const RegionalIndicator: Self = Self(13); // name="RI"
+ pub const HebrewLetter: Self = Self(14); // name="HL"
+ pub const SingleQuote: Self = Self(15); // name="SQ"
+ pub const DoubleQuote: Self = Self(16); // name="DQ"
+ /// This value is obsolete and unused.
+ pub const EBase: Self = Self(17); // name="EB"
+ /// This value is obsolete and unused.
+ pub const EBaseGAZ: Self = Self(18); // name="EBG"
+ /// This value is obsolete and unused.
+ pub const EModifier: Self = Self(19); // name="EM"
+ /// This value is obsolete and unused.
+ pub const GlueAfterZwj: Self = Self(20); // name="GAZ"
+ pub const ZWJ: Self = Self(21); // name="ZWJ"
+ pub const WSegSpace: Self = Self(22); // name="WSegSpace"
+}
+
+impl_value_getter! { // name<->value lookup constructors for `WordBreak`; the doctests below call the second name of each `a() / b()` pair
+ markers: WordBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_WB_V1, WordBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_WB_V1, WordBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_WB_V1;
+ impl WordBreak { // entries in order: name -> value, value -> short name, value -> long name. NOTE(review): the role of the first `get_*` name is set by the macro definition (not visible here) - confirm there
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Word_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::WordBreak;
+ ///
+ /// let lookup = WordBreak::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("KA"), Some(WordBreak::Katakana));
+ /// assert_eq!(lookup.get_strict("LE"), Some(WordBreak::ALetter));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Katakana"), Some(WordBreak::Katakana));
+ /// assert_eq!(lookup.get_strict("ALetter"), Some(WordBreak::ALetter));
+ /// // name has incorrect casing
+ /// assert_eq!(lookup.get_strict("Aletter"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("Aletter"), Some(WordBreak::ALetter));
+ /// assert_eq!(lookup.get_loose("w_seg_space"), Some(WordBreak::WSegSpace));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Quadruple_Quote"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `Word_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::WordBreak;
+ ///
+ /// let lookup = WordBreak::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(WordBreak::Katakana), Some("KA"));
+ /// assert_eq!(lookup.get(WordBreak::ALetter), Some("LE"));
+ /// assert_eq!(lookup.get(WordBreak::WSegSpace), Some("WSegSpace"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Word_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::WordBreak;
+ ///
+ /// let lookup = WordBreak::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(WordBreak::Katakana), Some("Katakana"));
+ /// assert_eq!(lookup.get(WordBreak::ALetter), Some("ALetter"));
+ /// assert_eq!(lookup.get(WordBreak::WSegSpace), Some("WSegSpace"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+
+/// Enumerated property Sentence_Break.
+/// See "Default Sentence Boundary Specification" in UAX #29 for the summary of
+/// each property value:
+/// <https://www.unicode.org/reports/tr29/#Default_Sentence_Boundaries>.
+///
+/// The numeric value is compatible with `USentenceBreak` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(SentenceBreakULE)]
+pub struct SentenceBreak(pub u8); // raw numeric value; the known values are exposed as associated constants (e.g. `SentenceBreak::ATerm`)
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl SentenceBreak { // one associated constant per Sentence_Break value; each `name=` note records the short alias
+ pub const Other: Self = Self(0); // name="XX"
+ pub const ATerm: Self = Self(1); // name="AT"
+ pub const Close: Self = Self(2); // name="CL"
+ pub const Format: Self = Self(3); // name="FO"
+ pub const Lower: Self = Self(4); // name="LO"
+ pub const Numeric: Self = Self(5); // name="NU"
+ pub const OLetter: Self = Self(6); // name="LE"
+ pub const Sep: Self = Self(7); // name="SE"
+ pub const Sp: Self = Self(8); // name="SP"
+ pub const STerm: Self = Self(9); // name="ST"
+ pub const Upper: Self = Self(10); // name="UP"
+ pub const CR: Self = Self(11); // name="CR"
+ pub const Extend: Self = Self(12); // name="EX"
+ pub const LF: Self = Self(13); // name="LF"
+ pub const SContinue: Self = Self(14); // name="SC"
+}
+
+impl_value_getter! { // name<->value lookup constructors for `SentenceBreak`; the doctests below call the second name of each `a() / b()` pair
+ markers: SentenceBreakNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_SB_V1, SentenceBreakValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_SB_V1, SentenceBreakValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_SB_V1;
+ impl SentenceBreak { // entries in order: name -> value, value -> short name, value -> long name. NOTE(review): the role of the first `get_*` name is set by the macro definition (not visible here) - confirm there
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Sentence_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::SentenceBreak;
+ ///
+ /// let lookup = SentenceBreak::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("FO"), Some(SentenceBreak::Format));
+ /// assert_eq!(lookup.get_strict("NU"), Some(SentenceBreak::Numeric));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Format"), Some(SentenceBreak::Format));
+ /// assert_eq!(lookup.get_strict("Numeric"), Some(SentenceBreak::Numeric));
+ /// // name has incorrect casing
+ /// assert_eq!(lookup.get_strict("fOrmat"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("fOrmat"), Some(SentenceBreak::Format));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Fixer_Upper"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `Sentence_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::SentenceBreak;
+ ///
+ /// let lookup = SentenceBreak::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(SentenceBreak::Format), Some("FO"));
+ /// assert_eq!(lookup.get(SentenceBreak::Numeric), Some("NU"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Sentence_Break` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::SentenceBreak;
+ ///
+ /// let lookup = SentenceBreak::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(SentenceBreak::Format), Some("Format"));
+ /// assert_eq!(lookup.get(SentenceBreak::Numeric), Some("Numeric"));
+ /// assert_eq!(lookup.get(SentenceBreak::SContinue), Some("SContinue"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
+/// Property Canonical_Combining_Class.
+/// See UAX #15:
+/// <https://www.unicode.org/reports/tr15/>.
+///
+/// See `icu_normalizer::properties::CanonicalCombiningClassMap` for the API
+/// to look up the Canonical_Combining_Class property by scalar value.
+//
+// NOTE: The Pernosco debugger has special knowledge
+// of this struct. Please do not change the bit layout
+// or the crate-module-qualified name of this struct
+// without coordination.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(CanonicalCombiningClassULE)]
+pub struct CanonicalCombiningClass(pub u8); // raw class number; the named classes are exposed as associated constants (e.g. `CanonicalCombiningClass::Above`)
+
+// These constant names come from PropertyValueAliases.txt
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl CanonicalCombiningClass { // values are not contiguous; each `name=` note records the short alias
+ pub const NotReordered: Self = Self(0); // name="NR"
+ pub const Overlay: Self = Self(1); // name="OV"
+ pub const HanReading: Self = Self(6); // name="HANR"
+ pub const Nukta: Self = Self(7); // name="NK"
+ pub const KanaVoicing: Self = Self(8); // name="KV"
+ pub const Virama: Self = Self(9); // name="VR"
+ pub const CCC10: Self = Self(10); // name="CCC10"
+ pub const CCC11: Self = Self(11); // name="CCC11"
+ pub const CCC12: Self = Self(12); // name="CCC12"
+ pub const CCC13: Self = Self(13); // name="CCC13"
+ pub const CCC14: Self = Self(14); // name="CCC14"
+ pub const CCC15: Self = Self(15); // name="CCC15"
+ pub const CCC16: Self = Self(16); // name="CCC16"
+ pub const CCC17: Self = Self(17); // name="CCC17"
+ pub const CCC18: Self = Self(18); // name="CCC18"
+ pub const CCC19: Self = Self(19); // name="CCC19"
+ pub const CCC20: Self = Self(20); // name="CCC20"
+ pub const CCC21: Self = Self(21); // name="CCC21"
+ pub const CCC22: Self = Self(22); // name="CCC22"
+ pub const CCC23: Self = Self(23); // name="CCC23"
+ pub const CCC24: Self = Self(24); // name="CCC24"
+ pub const CCC25: Self = Self(25); // name="CCC25"
+ pub const CCC26: Self = Self(26); // name="CCC26"
+ pub const CCC27: Self = Self(27); // name="CCC27"
+ pub const CCC28: Self = Self(28); // name="CCC28"
+ pub const CCC29: Self = Self(29); // name="CCC29"
+ pub const CCC30: Self = Self(30); // name="CCC30"
+ pub const CCC31: Self = Self(31); // name="CCC31"
+ pub const CCC32: Self = Self(32); // name="CCC32"
+ pub const CCC33: Self = Self(33); // name="CCC33"
+ pub const CCC34: Self = Self(34); // name="CCC34"
+ pub const CCC35: Self = Self(35); // name="CCC35"
+ pub const CCC36: Self = Self(36); // name="CCC36"
+ pub const CCC84: Self = Self(84); // name="CCC84"
+ pub const CCC91: Self = Self(91); // name="CCC91"
+ pub const CCC103: Self = Self(103); // name="CCC103"
+ pub const CCC107: Self = Self(107); // name="CCC107"
+ pub const CCC118: Self = Self(118); // name="CCC118"
+ pub const CCC122: Self = Self(122); // name="CCC122"
+ pub const CCC129: Self = Self(129); // name="CCC129"
+ pub const CCC130: Self = Self(130); // name="CCC130"
+ pub const CCC132: Self = Self(132); // name="CCC132"
+ pub const CCC133: Self = Self(133); // name="CCC133" // RESERVED
+ pub const AttachedBelowLeft: Self = Self(200); // name="ATBL"
+ pub const AttachedBelow: Self = Self(202); // name="ATB"
+ pub const AttachedAbove: Self = Self(214); // name="ATA"
+ pub const AttachedAboveRight: Self = Self(216); // name="ATAR"
+ pub const BelowLeft: Self = Self(218); // name="BL"
+ pub const Below: Self = Self(220); // name="B"
+ pub const BelowRight: Self = Self(222); // name="BR"
+ pub const Left: Self = Self(224); // name="L"
+ pub const Right: Self = Self(226); // name="R"
+ pub const AboveLeft: Self = Self(228); // name="AL"
+ pub const Above: Self = Self(230); // name="A"
+ pub const AboveRight: Self = Self(232); // name="AR"
+ pub const DoubleBelow: Self = Self(233); // name="DB"
+ pub const DoubleAbove: Self = Self(234); // name="DA"
+ pub const IotaSubscript: Self = Self(240); // name="IS"
+}
+
+impl_value_getter! { // name<->value lookup constructors for `CanonicalCombiningClass`; the doctests below call the second name of each `a() / b()` pair
+ markers: CanonicalCombiningClassNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_CCC_V1, CanonicalCombiningClassValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_SPARSE_CCC_V1, CanonicalCombiningClassValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_SPARSE_CCC_V1;
+ impl CanonicalCombiningClass { // this property uses the *Sparse* mapper types and `*_SPARSE_*` singletons for value -> name. NOTE(review): the role of the first `get_*` name is set by the macro definition (not visible here) - confirm there
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Canonical_Combining_Class` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::CanonicalCombiningClass;
+ ///
+ /// let lookup = CanonicalCombiningClass::name_to_enum_mapper();
+ /// // short name for value
+ /// assert_eq!(lookup.get_strict("AL"), Some(CanonicalCombiningClass::AboveLeft));
+ /// assert_eq!(lookup.get_strict("ATBL"), Some(CanonicalCombiningClass::AttachedBelowLeft));
+ /// assert_eq!(lookup.get_strict("CCC10"), Some(CanonicalCombiningClass::CCC10));
+ /// // long name for value
+ /// assert_eq!(lookup.get_strict("Above_Left"), Some(CanonicalCombiningClass::AboveLeft));
+ /// assert_eq!(lookup.get_strict("Attached_Below_Left"), Some(CanonicalCombiningClass::AttachedBelowLeft));
+ /// // name has incorrect casing and hyphens
+ /// assert_eq!(lookup.get_strict("attached-below-left"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("attached-below-left"), Some(CanonicalCombiningClass::AttachedBelowLeft));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Linear_Z"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameSparseMapper`], capable of looking up short names
+ /// for values of the `Canonical_Combining_Class` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::CanonicalCombiningClass;
+ ///
+ /// let lookup = CanonicalCombiningClass::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(CanonicalCombiningClass::AboveLeft), Some("AL"));
+ /// assert_eq!(lookup.get(CanonicalCombiningClass::AttachedBelowLeft), Some("ATBL"));
+ /// assert_eq!(lookup.get(CanonicalCombiningClass::CCC10), Some("CCC10"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameSparseMapper / PropertyEnumToValueNameSparseMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameSparseMapper`], capable of looking up long names
+ /// for values of the `Canonical_Combining_Class` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::CanonicalCombiningClass;
+ ///
+ /// let lookup = CanonicalCombiningClass::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(CanonicalCombiningClass::AboveLeft), Some("Above_Left"));
+ /// assert_eq!(lookup.get(CanonicalCombiningClass::AttachedBelowLeft), Some("Attached_Below_Left"));
+ /// assert_eq!(lookup.get(CanonicalCombiningClass::CCC10), Some("CCC10"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameSparseMapper / PropertyEnumToValueNameSparseMapperBorrowed;
+ }
+}
+
+/// Property Indic_Syllabic_Category.
+/// See UAX #44:
+/// <https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category>.
+///
+/// The numeric value is compatible with `UIndicSyllabicCategory` in ICU4C.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties))]
+#[allow(clippy::exhaustive_structs)] // newtype
+#[repr(transparent)]
+#[zerovec::make_ule(IndicSyllabicCategoryULE)]
+pub struct IndicSyllabicCategory(pub u8); // raw numeric value; the known values are exposed as associated constants (e.g. `IndicSyllabicCategory::Vowel`)
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+impl IndicSyllabicCategory { // one associated constant per Indic_Syllabic_Category value, numbered consecutively from 0
+ pub const Other: Self = Self(0);
+ pub const Avagraha: Self = Self(1);
+ pub const Bindu: Self = Self(2);
+ pub const BrahmiJoiningNumber: Self = Self(3);
+ pub const CantillationMark: Self = Self(4);
+ pub const Consonant: Self = Self(5);
+ pub const ConsonantDead: Self = Self(6);
+ pub const ConsonantFinal: Self = Self(7);
+ pub const ConsonantHeadLetter: Self = Self(8);
+ pub const ConsonantInitialPostfixed: Self = Self(9);
+ pub const ConsonantKiller: Self = Self(10);
+ pub const ConsonantMedial: Self = Self(11);
+ pub const ConsonantPlaceholder: Self = Self(12);
+ pub const ConsonantPrecedingRepha: Self = Self(13);
+ pub const ConsonantPrefixed: Self = Self(14);
+ pub const ConsonantSucceedingRepha: Self = Self(15);
+ pub const ConsonantSubjoined: Self = Self(16);
+ pub const ConsonantWithStacker: Self = Self(17);
+ pub const GeminationMark: Self = Self(18);
+ pub const InvisibleStacker: Self = Self(19);
+ pub const Joiner: Self = Self(20);
+ pub const ModifyingLetter: Self = Self(21);
+ pub const NonJoiner: Self = Self(22);
+ pub const Nukta: Self = Self(23);
+ pub const Number: Self = Self(24);
+ pub const NumberJoiner: Self = Self(25);
+ pub const PureKiller: Self = Self(26);
+ pub const RegisterShifter: Self = Self(27);
+ pub const SyllableModifier: Self = Self(28);
+ pub const ToneLetter: Self = Self(29);
+ pub const ToneMark: Self = Self(30);
+ pub const Virama: Self = Self(31);
+ pub const Visarga: Self = Self(32);
+ pub const Vowel: Self = Self(33);
+ pub const VowelDependent: Self = Self(34);
+ pub const VowelIndependent: Self = Self(35);
+}
+
+impl_value_getter! { // name<->value lookup constructors for `IndicSyllabicCategory`; the doctests below call the second name of each `a() / b()` pair
+ markers: IndicSyllabicCategoryNameToValueV1Marker / SINGLETON_PROPNAMES_FROM_INSC_V1, IndicSyllabicCategoryValueToShortNameV1Marker / SINGLETON_PROPNAMES_TO_SHORT_LINEAR_INSC_V1, IndicSyllabicCategoryValueToLongNameV1Marker / SINGLETON_PROPNAMES_TO_LONG_LINEAR_INSC_V1;
+ impl IndicSyllabicCategory { // per the doctests, short and long names are the same strings for this property. NOTE(review): the role of the first `get_*` name is set by the macro definition (not visible here) - confirm there
+ /// Return a [`PropertyValueNameToEnumMapper`], capable of looking up values
+ /// from strings for the `Indic_Syllabic_Category` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::IndicSyllabicCategory;
+ ///
+ /// let lookup = IndicSyllabicCategory::name_to_enum_mapper();
+ /// // long/short name for value
+ /// assert_eq!(lookup.get_strict("Brahmi_Joining_Number"), Some(IndicSyllabicCategory::BrahmiJoiningNumber));
+ /// assert_eq!(lookup.get_strict("Vowel_Independent"), Some(IndicSyllabicCategory::VowelIndependent));
+ /// // name has incorrect casing and hyphens
+ /// assert_eq!(lookup.get_strict("brahmi-joining-number"), None);
+ /// // loose matching of name
+ /// assert_eq!(lookup.get_loose("brahmi-joining-number"), Some(IndicSyllabicCategory::BrahmiJoiningNumber));
+ /// // fake property
+ /// assert_eq!(lookup.get_strict("Tone_Number"), None);
+ /// ```
+ pub fn get_name_to_enum_mapper() / name_to_enum_mapper();
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up short names
+ /// for values of the `Indic_Syllabic_Category` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::IndicSyllabicCategory;
+ ///
+ /// let lookup = IndicSyllabicCategory::enum_to_short_name_mapper();
+ /// assert_eq!(lookup.get(IndicSyllabicCategory::BrahmiJoiningNumber), Some("Brahmi_Joining_Number"));
+ /// assert_eq!(lookup.get(IndicSyllabicCategory::VowelIndependent), Some("Vowel_Independent"));
+ /// ```
+ pub fn get_enum_to_short_name_mapper() / enum_to_short_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ /// Return a [`PropertyEnumToValueNameLinearMapper`], capable of looking up long names
+ /// for values of the `Indic_Syllabic_Category` enumerated property.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::properties::IndicSyllabicCategory;
+ ///
+ /// let lookup = IndicSyllabicCategory::enum_to_long_name_mapper();
+ /// assert_eq!(lookup.get(IndicSyllabicCategory::BrahmiJoiningNumber), Some("Brahmi_Joining_Number"));
+ /// assert_eq!(lookup.get(IndicSyllabicCategory::VowelIndependent), Some("Vowel_Independent"));
+ /// ```
+ pub fn get_enum_to_long_name_mapper() / enum_to_long_name_mapper() -> PropertyEnumToValueNameLinearMapper / PropertyEnumToValueNameLinearMapperBorrowed;
+ }
+}
diff --git a/third_party/rust/icu_properties/src/provider.rs b/third_party/rust/icu_properties/src/provider.rs
new file mode 100644
index 0000000000..53fb2d5fd7
--- /dev/null
+++ b/third_party/rust/icu_properties/src/provider.rs
@@ -0,0 +1,900 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+// Provider structs must be stable
+#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
+
+//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
+//!
+//! <div class="stab unstable">
+//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
+//! to be stable, their Rust representation might not be. Use with caution.
+//! </div>
+//!
+//! Read more about data providers: [`icu_provider`]
+
+pub mod names;
+
+use crate::script::ScriptWithExt;
+use crate::Script;
+
+use core::ops::RangeInclusive;
+use core::str;
+use icu_collections::codepointinvlist::CodePointInversionList;
+use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
+use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
+use icu_provider::prelude::*;
+use icu_provider::{DataKeyMetadata, FallbackPriority};
+use zerofrom::ZeroFrom;
+
+use zerovec::{VarZeroVec, ZeroSlice, ZeroVecError};
+
+#[cfg(feature = "compiled_data")]
+#[derive(Debug)]
+/// Baked data
+///
+/// This unit struct only exists when the `compiled_data` Cargo feature is enabled. It is the
+/// type that the `icu_properties_data` macros in this file are invoked with.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
+/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
+/// </div>
+pub struct Baked;
+
+// Applies the `icu_properties_data` macros to `Baked`. Everything lives inside an anonymous
+// `const _` block, so the helper `icu` module declared here is scoped to the block and does
+// not become an item of this module.
+#[cfg(feature = "compiled_data")]
+const _: () = {
+ // Re-exports this crate and two of its dependencies under `icu::*` paths.
+ // NOTE(review): presumably the macros below expand to code that names types through
+ // these paths (e.g. `icu::properties::...`) — confirm against `icu_properties_data`.
+ pub mod icu {
+ pub use crate as properties;
+ pub use icu_collections as collections;
+ pub use icu_locid_transform as locid_transform;
+ }
+ icu_properties_data::make_provider!(Baked);
+ // Property value name lookups ("propnames/...").
+ icu_properties_data::impl_propnames_from_gcb_v1!(Baked);
+ icu_properties_data::impl_propnames_from_bc_v1!(Baked);
+ icu_properties_data::impl_propnames_from_ccc_v1!(Baked);
+ icu_properties_data::impl_propnames_from_ea_v1!(Baked);
+ icu_properties_data::impl_propnames_from_gc_v1!(Baked);
+ icu_properties_data::impl_propnames_from_gcm_v1!(Baked);
+ icu_properties_data::impl_propnames_from_insc_v1!(Baked);
+ icu_properties_data::impl_propnames_from_lb_v1!(Baked);
+ icu_properties_data::impl_propnames_from_sb_v1!(Baked);
+ icu_properties_data::impl_propnames_from_sc_v1!(Baked);
+ icu_properties_data::impl_propnames_from_wb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_bc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_ea_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_gc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_gcb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_insc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_lb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_sb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_sc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_linear_wb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_long_sparse_ccc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_bc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_ea_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_gc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_gcb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_insc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_lb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_sb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear_wb_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_linear4_sc_v1!(Baked);
+ icu_properties_data::impl_propnames_to_short_sparse_ccc_v1!(Baked);
+ // Property data itself ("props/...").
+ icu_properties_data::impl_props_ahex_v1!(Baked);
+ icu_properties_data::impl_props_alnum_v1!(Baked);
+ icu_properties_data::impl_props_alpha_v1!(Baked);
+ icu_properties_data::impl_props_basic_emoji_v1!(Baked);
+ icu_properties_data::impl_props_bc_v1!(Baked);
+ icu_properties_data::impl_props_bidi_c_v1!(Baked);
+ icu_properties_data::impl_props_bidi_m_v1!(Baked);
+ icu_properties_data::impl_props_bidiauxiliaryprops_v1!(Baked);
+ icu_properties_data::impl_props_blank_v1!(Baked);
+ icu_properties_data::impl_props_cased_v1!(Baked);
+ icu_properties_data::impl_props_ccc_v1!(Baked);
+ icu_properties_data::impl_props_ci_v1!(Baked);
+ icu_properties_data::impl_props_comp_ex_v1!(Baked);
+ icu_properties_data::impl_props_cwcf_v1!(Baked);
+ icu_properties_data::impl_props_cwcm_v1!(Baked);
+ icu_properties_data::impl_props_cwkcf_v1!(Baked);
+ icu_properties_data::impl_props_cwl_v1!(Baked);
+ icu_properties_data::impl_props_cwt_v1!(Baked);
+ icu_properties_data::impl_props_cwu_v1!(Baked);
+ icu_properties_data::impl_props_dash_v1!(Baked);
+ icu_properties_data::impl_props_dep_v1!(Baked);
+ icu_properties_data::impl_props_di_v1!(Baked);
+ icu_properties_data::impl_props_dia_v1!(Baked);
+ icu_properties_data::impl_props_ea_v1!(Baked);
+ icu_properties_data::impl_props_ebase_v1!(Baked);
+ icu_properties_data::impl_props_ecomp_v1!(Baked);
+ icu_properties_data::impl_props_emod_v1!(Baked);
+ icu_properties_data::impl_props_emoji_v1!(Baked);
+ icu_properties_data::impl_props_epres_v1!(Baked);
+ icu_properties_data::impl_props_exemplarchars_auxiliary_v1!(Baked);
+ icu_properties_data::impl_props_exemplarchars_index_v1!(Baked);
+ icu_properties_data::impl_props_exemplarchars_main_v1!(Baked);
+ icu_properties_data::impl_props_exemplarchars_numbers_v1!(Baked);
+ icu_properties_data::impl_props_exemplarchars_punctuation_v1!(Baked);
+ icu_properties_data::impl_props_ext_v1!(Baked);
+ icu_properties_data::impl_props_extpict_v1!(Baked);
+ icu_properties_data::impl_props_gc_v1!(Baked);
+ icu_properties_data::impl_props_gcb_v1!(Baked);
+ icu_properties_data::impl_props_gr_base_v1!(Baked);
+ icu_properties_data::impl_props_gr_ext_v1!(Baked);
+ icu_properties_data::impl_props_gr_link_v1!(Baked);
+ icu_properties_data::impl_props_graph_v1!(Baked);
+ icu_properties_data::impl_props_hex_v1!(Baked);
+ icu_properties_data::impl_props_hyphen_v1!(Baked);
+ icu_properties_data::impl_props_idc_v1!(Baked);
+ icu_properties_data::impl_props_ideo_v1!(Baked);
+ icu_properties_data::impl_props_ids_v1!(Baked);
+ icu_properties_data::impl_props_idsb_v1!(Baked);
+ icu_properties_data::impl_props_idst_v1!(Baked);
+ icu_properties_data::impl_props_insc_v1!(Baked);
+ icu_properties_data::impl_props_join_c_v1!(Baked);
+ icu_properties_data::impl_props_lb_v1!(Baked);
+ icu_properties_data::impl_props_loe_v1!(Baked);
+ icu_properties_data::impl_props_lower_v1!(Baked);
+ icu_properties_data::impl_props_math_v1!(Baked);
+ icu_properties_data::impl_props_nchar_v1!(Baked);
+ icu_properties_data::impl_props_nfcinert_v1!(Baked);
+ icu_properties_data::impl_props_nfdinert_v1!(Baked);
+ icu_properties_data::impl_props_nfkcinert_v1!(Baked);
+ icu_properties_data::impl_props_nfkdinert_v1!(Baked);
+ icu_properties_data::impl_props_pat_syn_v1!(Baked);
+ icu_properties_data::impl_props_pat_ws_v1!(Baked);
+ icu_properties_data::impl_props_pcm_v1!(Baked);
+ icu_properties_data::impl_props_print_v1!(Baked);
+ icu_properties_data::impl_props_qmark_v1!(Baked);
+ icu_properties_data::impl_props_radical_v1!(Baked);
+ icu_properties_data::impl_props_ri_v1!(Baked);
+ icu_properties_data::impl_props_sb_v1!(Baked);
+ icu_properties_data::impl_props_sc_v1!(Baked);
+ icu_properties_data::impl_props_scx_v1!(Baked);
+ icu_properties_data::impl_props_sd_v1!(Baked);
+ icu_properties_data::impl_props_segstart_v1!(Baked);
+ icu_properties_data::impl_props_sensitive_v1!(Baked);
+ icu_properties_data::impl_props_sterm_v1!(Baked);
+ icu_properties_data::impl_props_term_v1!(Baked);
+ icu_properties_data::impl_props_uideo_v1!(Baked);
+ icu_properties_data::impl_props_upper_v1!(Baked);
+ icu_properties_data::impl_props_vs_v1!(Baked);
+ icu_properties_data::impl_props_wb_v1!(Baked);
+ icu_properties_data::impl_props_wspace_v1!(Baked);
+ icu_properties_data::impl_props_xdigit_v1!(Baked);
+ icu_properties_data::impl_props_xidc_v1!(Baked);
+ icu_properties_data::impl_props_xids_v1!(Baked);
+};
+
+// include the specialized structs for the compact representation of Bidi property data
+pub mod bidi_data;
+
+/// A set of characters which share a particular property value.
+///
+/// This data enum is extensible, more backends may be added in the future.
+/// Old data can be used with newer code but not vice versa.
+///
+/// The only backend at present is an inversion list ([`CodePointInversionList`]).
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[non_exhaustive]
+pub enum PropertyCodePointSetV1<'data> {
+ /// The set of characters, represented as an inversion list
+ InversionList(#[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionList<'data>),
+ // new variants should go BELOW existing ones
+ // Serde serializes based on variant name and index in the enum
+ // (so inserting a variant above an existing one would shift the existing indices)
+ // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
+}
+
+/// A map efficiently storing data about individual characters.
+///
+/// This data enum is extensible, more backends may be added in the future.
+/// Old data can be used with newer code but not vice versa.
+///
+/// The only backend at present is a [`CodePointTrie`] whose values are of type `T`.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Clone, Debug, Eq, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[non_exhaustive]
+pub enum PropertyCodePointMapV1<'data, T: TrieValue> {
+ /// A codepoint trie storing the data
+ CodePointTrie(#[cfg_attr(feature = "serde", serde(borrow))] CodePointTrie<'data, T>),
+ // new variants should go BELOW existing ones
+ // Serde serializes based on variant name and index in the enum
+ // (so inserting a variant above an existing one would shift the existing indices)
+ // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
+}
+
+/// A set of characters and strings which share a particular property value.
+///
+/// The enum is marked `#[non_exhaustive]`; the only backend at present is a
+/// [`CodePointInversionListAndStringList`].
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[non_exhaustive]
+pub enum PropertyUnicodeSetV1<'data> {
+ /// A set representing characters in an inversion list, and the strings in a list.
+ CPInversionListStrList(
+ #[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionListAndStringList<'data>,
+ ),
+ // new variants should go BELOW existing ones
+ // Serde serializes based on variant name and index in the enum
+ // (so inserting a variant above an existing one would shift the existing indices)
+ // https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
+}
+
+// Crate-internal accessors; each one forwards to the backing data structure.
+impl<'data> PropertyUnicodeSetV1<'data> {
+    /// Reports whether the string `s` is contained in the backing list.
+    #[inline]
+    pub(crate) fn contains(&self, s: &str) -> bool {
+        // Irrefutable while the enum has a single variant; adding a variant turns this
+        // into a compile error, which forces the method to be revisited.
+        let Self::CPInversionListStrList(ref list) = *self;
+        list.contains(s)
+    }
+
+    /// Reports whether the code point `cp`, given as a `u32`, is contained in the backing list.
+    #[inline]
+    pub(crate) fn contains32(&self, cp: u32) -> bool {
+        let Self::CPInversionListStrList(ref list) = *self;
+        list.contains32(cp)
+    }
+
+    /// Reports whether the character `ch` is contained in the backing list.
+    #[inline]
+    pub(crate) fn contains_char(&self, ch: char) -> bool {
+        let Self::CPInversionListStrList(ref list) = *self;
+        list.contains_char(ch)
+    }
+
+    /// Wraps a `'static` [`CodePointInversionListAndStringList`] in this enum.
+    #[inline]
+    pub(crate) fn from_code_point_inversion_list_string_list(
+        l: CodePointInversionListAndStringList<'static>,
+    ) -> Self {
+        Self::CPInversionListStrList(l)
+    }
+
+    /// Borrows the backing [`CodePointInversionListAndStringList`], if that is the backend.
+    #[inline]
+    pub(crate) fn as_code_point_inversion_list_string_list(
+        &self,
+    ) -> Option<&CodePointInversionListAndStringList<'data>> {
+        let Self::CPInversionListStrList(ref list) = *self;
+        // any other backing data structure that cannot return a CPInversionListStrList in
+        // O(1) time should return None
+        Some(list)
+    }
+
+    /// Produces a [`CodePointInversionListAndStringList`] from the backing data via
+    /// [`ZeroFrom`].
+    #[inline]
+    pub(crate) fn to_code_point_inversion_list_string_list(
+        &self,
+    ) -> CodePointInversionListAndStringList<'_> {
+        let Self::CPInversionListStrList(ref list) = *self;
+        ZeroFrom::zero_from(list)
+    }
+}
+
+/// A struct that efficiently stores `Script` and `Script_Extensions` property data.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[icu_provider::data_struct(marker(
+ ScriptWithExtensionsPropertyV1Marker,
+ "props/scx@1",
+ singleton
+))]
+#[derive(Debug, Eq, PartialEq, Clone)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct ScriptWithExtensionsPropertyV1<'data> {
+ /// A code point trie whose values are of type `ScriptWithExt`.
+ ///
+ /// Note: The `ScriptWithExt` values in this array will assume a 12-bit layout. The 2
+ /// higher order bits 11..10 will indicate how to deduce the Script value and
+ /// Script_Extensions value, nearly matching the representation
+ /// [in ICU](https://github.com/unicode-org/icu/blob/main/icu4c/source/common/uprops.h):
+ ///
+ /// | High order 2 bits value | Script | Script_Extensions |
+ /// |-------------------------|--------------------------------------------------------|----------------------------------------------------------------|
+ /// | 3 | First value in sub-array, index given by lower 10 bits | Sub-array excluding first value, index given by lower 10 bits |
+ /// | 2 | Script=Inherited | Entire sub-array, index given by lower 10 bits |
+ /// | 1 | Script=Common | Entire sub-array, index given by lower 10 bits |
+ /// | 0 | Value in lower 10 bits | `[ Script value ]` single-element array |
+ ///
+ /// When the lower 10 bits of the value are used as an index, that index is
+ /// used for the outer-level vector of the nested `extensions` structure.
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub trie: CodePointTrie<'data, ScriptWithExt>,
+
+ /// A vector of `Script` slices, indexed as described for the `trie` field.
+ ///
+ /// This companion structure stores Script_Extensions values, which are
+ /// themselves arrays / vectors. This structure only stores the values for
+ /// cases in which `scx(cp) != [ sc(cp) ]`. Each sub-vector is distinct. The
+ /// sub-vector represents the Script_Extensions array value for a code point,
+ /// and may also indicate Script value, as described for the `trie` field.
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub extensions: VarZeroVec<'data, ZeroSlice<Script>>,
+}
+
+impl<'data> ScriptWithExtensionsPropertyV1<'data> {
+    // Hidden from the rendered docs: meant for data-provider code that builds this
+    // struct out of already deserialized parts. The two arguments are stored as-is.
+    #[doc(hidden)]
+    pub fn new(
+        trie: CodePointTrie<'data, ScriptWithExt>,
+        extensions: VarZeroVec<'data, ZeroSlice<Script>>,
+    ) -> Self {
+        Self { trie, extensions }
+    }
+}
+
+// See CodePointSetData for documentation of these functions
+impl<'data> PropertyCodePointSetV1<'data> {
+    /// Reports whether the character `ch` is contained in the backing inversion list.
+    #[inline]
+    pub(crate) fn contains(&self, ch: char) -> bool {
+        // Irrefutable while the enum has a single variant; adding a variant turns this
+        // into a compile error, which forces the method to be revisited.
+        let Self::InversionList(ref inv_list) = *self;
+        inv_list.contains(ch)
+    }
+
+    /// Reports whether the code point `ch`, given as a `u32`, is contained in the backing
+    /// inversion list.
+    #[inline]
+    pub(crate) fn contains32(&self, ch: u32) -> bool {
+        let Self::InversionList(ref inv_list) = *self;
+        inv_list.contains32(ch)
+    }
+
+    /// Forwards to `iter_ranges` of the backing inversion list.
+    #[inline]
+    pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
+        let Self::InversionList(ref inv_list) = *self;
+        inv_list.iter_ranges()
+    }
+
+    /// Forwards to `iter_ranges_complemented` of the backing inversion list.
+    #[inline]
+    pub(crate) fn iter_ranges_complemented(
+        &self,
+    ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
+        let Self::InversionList(ref inv_list) = *self;
+        inv_list.iter_ranges_complemented()
+    }
+
+    /// Wraps a `'static` [`CodePointInversionList`] in this enum.
+    #[inline]
+    pub(crate) fn from_code_point_inversion_list(l: CodePointInversionList<'static>) -> Self {
+        Self::InversionList(l)
+    }
+
+    /// Borrows the backing [`CodePointInversionList`], if that is the backend.
+    #[inline]
+    pub(crate) fn as_code_point_inversion_list(
+        &self,
+    ) -> Option<&CodePointInversionList<'data>> {
+        let Self::InversionList(ref inv_list) = *self;
+        // any other backing data structure that cannot return a CPInvList in O(1) time
+        // should return None
+        Some(inv_list)
+    }
+
+    /// Produces a [`CodePointInversionList`] from the backing data via [`ZeroFrom`].
+    #[inline]
+    pub(crate) fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
+        let Self::InversionList(ref inv_list) = *self;
+        ZeroFrom::zero_from(inv_list)
+    }
+}
+
+// See CodePointMapData for documentation of these functions
+impl<'data, T: TrieValue> PropertyCodePointMapV1<'data, T> {
+    /// Looks up the value stored for the code point `ch`, given as a `u32`.
+    #[inline]
+    pub(crate) fn get32(&self, ch: u32) -> T {
+        // Irrefutable while the enum has a single variant; adding a variant turns this
+        // into a compile error, which forces the method to be revisited.
+        let Self::CodePointTrie(ref trie) = *self;
+        trie.get32(ch)
+    }
+
+    /// Consumes the map and converts the backing trie to value type `P`, re-wrapping the
+    /// converted trie on success and passing the `ZeroVecError` through on failure.
+    #[inline]
+    pub(crate) fn try_into_converted<P>(
+        self,
+    ) -> Result<PropertyCodePointMapV1<'data, P>, ZeroVecError>
+    where
+        P: TrieValue,
+    {
+        let Self::CodePointTrie(trie) = self;
+        trie.try_into_converted()
+            .map(PropertyCodePointMapV1::CodePointTrie)
+    }
+
+    /// Forwards to `get_set_for_value` of the backing trie.
+    #[inline]
+    pub(crate) fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
+        let Self::CodePointTrie(ref trie) = *self;
+        trie.get_set_for_value(value)
+    }
+
+    /// Forwards to `iter_ranges` of the backing trie.
+    #[inline]
+    pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = CodePointMapRange<T>> + '_ {
+        let Self::CodePointTrie(ref trie) = *self;
+        trie.iter_ranges()
+    }
+
+    /// Forwards to `iter_ranges_mapped` of the backing trie, handing `map` over unchanged.
+    #[inline]
+    pub(crate) fn iter_ranges_mapped<'a, U: Eq + 'a>(
+        &'a self,
+        map: impl FnMut(T) -> U + Copy + 'a,
+    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
+        let Self::CodePointTrie(ref trie) = *self;
+        trie.iter_ranges_mapped(map)
+    }
+
+    /// Wraps a `'static` [`CodePointTrie`] in this enum.
+    #[inline]
+    pub(crate) fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
+        Self::CodePointTrie(trie)
+    }
+
+    /// Borrows the backing [`CodePointTrie`], if that is the backend.
+    #[inline]
+    pub(crate) fn as_code_point_trie(&self) -> Option<&CodePointTrie<'data, T>> {
+        let Self::CodePointTrie(ref trie) = *self;
+        // any other backing data structure that cannot return a CPT in O(1) time should
+        // return None
+        Some(trie)
+    }
+
+    /// Produces a [`CodePointTrie`] from the backing data via [`ZeroFrom`].
+    #[inline]
+    pub(crate) fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
+        let Self::CodePointTrie(ref trie) = *self;
+        ZeroFrom::zero_from(trie)
+    }
+}
+
+// Generates the data marker types of this module together with the `KEYS` list.
+//
+// The macro takes three parenthesized groups, each a non-empty list of comma-terminated tuples:
+// 1. `(Marker, "name")`: markers whose `Yokeable` is `PropertyCodePointSetV1`, keyed as
+// "props/<name>@1".
+// 2. `(Marker, "name", bool)`: markers whose `Yokeable` is `PropertyUnicodeSetV1`, keyed as
+// "props/<name>@1"; the boolean is forwarded as the last argument of
+// `DataKeyMetadata::construct_internal`.
+// 3. `(MapMarker, NameToValueMarker, (kind: ShortNameMarker, LongNameMarker), "name", ValueType)`:
+// a marker whose `Yokeable` is `PropertyCodePointMapV1<crate::ValueType>`, a marker for parsing
+// value names ("propnames/from/<name>@1"), and short/long value name markers. `kind:` selects
+// among the optional `sparse:`, `linear:` and `linear4:` sub-groups, which differ in the name
+// map type and key path used.
+macro_rules! expand {
+ (
+ ($(($code_point_set_marker:ident, $bin_cp_s:literal),)+),
+ ($(($unicode_set_marker:ident, $bin_us_s:literal, $us_singleton:literal),)+),
+ ($(($code_point_map_marker:ident,
+ $name_value_marker:ident,
+
+ $((sparse: $value_short_name_marker_sparse:ident, $value_long_name_marker_sparse:ident),)?
+ $((linear: $value_short_name_marker_linear:ident, $value_long_name_marker_linear:ident ),)?
+ $((linear4: $value_short_name_marker_linear4:ident, $value_long_name_marker_linear4:ident ),)?
+ $enum_s:literal, $value_ty:ident),)+)
+ ) => {
+
+ // Data keys that return code point sets (represented as CodePointSetData).
+ // For now, synonymous with binary properties of code points only.
+ $(
+ #[doc = core::concat!("Data marker for the '", $bin_cp_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $code_point_set_marker;
+
+ impl DataMarker for $code_point_set_marker {
+ type Yokeable = PropertyCodePointSetV1<'static>;
+ }
+ impl KeyedDataMarker for $code_point_set_marker {
+ const KEY: DataKey = data_key!(concat!("props/", $bin_cp_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+
+ )+
+
+ // Data keys that return sets of strings + code points (represented as UnicodeSetData).
+ // Includes:
+ // - binary properties of strings + code points
+ // - exemplar characters
+ $(
+ #[doc = core::concat!("Data marker for the '", $bin_us_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $unicode_set_marker;
+
+ impl DataMarker for $unicode_set_marker {
+ type Yokeable = PropertyUnicodeSetV1<'static>;
+ }
+ impl KeyedDataMarker for $unicode_set_marker {
+ const KEY: DataKey = data_key!(concat!("props/", $bin_us_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, $us_singleton));
+ }
+ )+
+
+ // Data keys that return code point map (represented as CodePointMapData).
+ // For now, synonymous with enumerated properties [of code points only].
+ $(
+ #[doc = core::concat!("Data marker for the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $code_point_map_marker;
+
+ impl DataMarker for $code_point_map_marker {
+ type Yokeable = PropertyCodePointMapV1<'static, crate::$value_ty>;
+ }
+
+ impl KeyedDataMarker for $code_point_map_marker {
+ const KEY: DataKey = data_key!(concat!("props/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+
+
+ #[doc = core::concat!("Data marker for parsing the names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $name_value_marker;
+
+ impl DataMarker for $name_value_marker {
+ type Yokeable = names::PropertyValueNameToEnumMapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $name_value_marker {
+ const KEY: DataKey = data_key!(concat!("propnames/from/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+
+ // `sparse:` sub-group: short and long name markers backed by the sparse name map.
+ $(
+ #[doc = core::concat!("Data marker for producing short names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $value_short_name_marker_sparse;
+
+ impl DataMarker for $value_short_name_marker_sparse {
+ type Yokeable = names::PropertyEnumToValueNameSparseMapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $value_short_name_marker_sparse {
+ const KEY: DataKey = data_key!(concat!("propnames/to/short/sparse/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+
+ #[doc = core::concat!("Data marker for producing long names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $value_long_name_marker_sparse;
+
+ impl DataMarker for $value_long_name_marker_sparse {
+ type Yokeable = names::PropertyEnumToValueNameSparseMapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $value_long_name_marker_sparse {
+ const KEY: DataKey = data_key!(concat!("propnames/to/long/sparse/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+ )?
+
+ // `linear:` sub-group: short and long name markers backed by the linear name map.
+ $(
+ #[doc = core::concat!("Data marker for producing short names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $value_short_name_marker_linear;
+
+ impl DataMarker for $value_short_name_marker_linear {
+ type Yokeable = names::PropertyEnumToValueNameLinearMapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $value_short_name_marker_linear {
+ const KEY: DataKey = data_key!(concat!("propnames/to/short/linear/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+
+ #[doc = core::concat!("Data marker for producing long names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $value_long_name_marker_linear;
+
+ impl DataMarker for $value_long_name_marker_linear {
+ type Yokeable = names::PropertyEnumToValueNameLinearMapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $value_long_name_marker_linear {
+ const KEY: DataKey = data_key!(concat!("propnames/to/long/linear/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+ )?
+
+ // `linear4:` sub-group: the short name marker uses the Tiny4 linear map; the long name
+ // marker uses the regular linear map and the "propnames/to/long/linear/" key path.
+ $(
+ #[doc = core::concat!("Data marker for producing short names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $value_short_name_marker_linear4;
+
+ impl DataMarker for $value_short_name_marker_linear4 {
+ type Yokeable = names::PropertyEnumToValueNameLinearTiny4MapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $value_short_name_marker_linear4 {
+ const KEY: DataKey = data_key!(concat!("propnames/to/short/linear4/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+
+ #[doc = core::concat!("Data marker for producing long names of the values of the '", $enum_s, "' Unicode property")]
+ #[derive(Debug, Default)]
+ #[cfg_attr(
+ feature = "datagen",
+ derive(databake::Bake),
+ databake(path = icu_properties::provider),
+ )]
+ pub struct $value_long_name_marker_linear4;
+
+ impl DataMarker for $value_long_name_marker_linear4 {
+ // Tiny4 is only for short names
+ type Yokeable = names::PropertyEnumToValueNameLinearMapV1<'static>;
+ }
+
+ impl KeyedDataMarker for $value_long_name_marker_linear4 {
+ const KEY: DataKey = data_key!(concat!("propnames/to/long/linear/", $enum_s, "@1"), DataKeyMetadata::construct_internal(FallbackPriority::Language, None, None, true));
+ }
+ )?
+ )+
+
+ /// All data keys in this module.
+ pub const KEYS: &[DataKey] = &[
+ $($code_point_set_marker::KEY,)+
+ $($unicode_set_marker::KEY,)+
+ $(
+ $code_point_map_marker::KEY,
+ $name_value_marker::KEY,
+ $($value_short_name_marker_sparse::KEY, $value_long_name_marker_sparse::KEY,)?
+ $($value_short_name_marker_linear::KEY, $value_long_name_marker_linear::KEY,)?
+ $($value_short_name_marker_linear4::KEY, $value_long_name_marker_linear4::KEY,)?
+ )+
+ // Keys of markers that are not generated by this macro.
+ bidi_data::BidiAuxiliaryPropertiesV1Marker::KEY,
+ GeneralCategoryMaskNameToValueV1Marker::KEY,
+ ScriptWithExtensionsPropertyV1Marker::KEY,
+ ];
+ };
+}
+
+pub use self::names::GeneralCategoryMaskNameToValueV1Marker;
+
+// Instantiates the marker types of this module. The three parenthesized groups below are,
+// in order: code point set markers, UnicodeSet markers, and code point map markers with
+// their value-name markers. The string literals become part of the generated data keys.
+expand!(
+ (
+ // code point sets
+ (AsciiHexDigitV1Marker, "AHex"),
+ (AlnumV1Marker, "alnum"),
+ (AlphabeticV1Marker, "Alpha"),
+ (BidiControlV1Marker, "Bidi_C"),
+ (BidiMirroredV1Marker, "Bidi_M"),
+ (BlankV1Marker, "blank"),
+ (CasedV1Marker, "Cased"),
+ (CaseIgnorableV1Marker, "CI"),
+ (FullCompositionExclusionV1Marker, "Comp_Ex"),
+ (ChangesWhenCasefoldedV1Marker, "CWCF"),
+ (ChangesWhenCasemappedV1Marker, "CWCM"),
+ (ChangesWhenNfkcCasefoldedV1Marker, "CWKCF"),
+ (ChangesWhenLowercasedV1Marker, "CWL"),
+ (ChangesWhenTitlecasedV1Marker, "CWT"),
+ (ChangesWhenUppercasedV1Marker, "CWU"),
+ (DashV1Marker, "Dash"),
+ (DeprecatedV1Marker, "Dep"),
+ (DefaultIgnorableCodePointV1Marker, "DI"),
+ (DiacriticV1Marker, "Dia"),
+ (EmojiModifierBaseV1Marker, "EBase"),
+ (EmojiComponentV1Marker, "EComp"),
+ (EmojiModifierV1Marker, "EMod"),
+ (EmojiV1Marker, "Emoji"),
+ (EmojiPresentationV1Marker, "EPres"),
+ (ExtenderV1Marker, "Ext"),
+ (ExtendedPictographicV1Marker, "ExtPict"),
+ (GraphV1Marker, "graph"),
+ (GraphemeBaseV1Marker, "Gr_Base"),
+ (GraphemeExtendV1Marker, "Gr_Ext"),
+ (GraphemeLinkV1Marker, "Gr_Link"),
+ (HexDigitV1Marker, "Hex"),
+ (HyphenV1Marker, "Hyphen"),
+ (IdContinueV1Marker, "IDC"),
+ (IdeographicV1Marker, "Ideo"),
+ (IdStartV1Marker, "IDS"),
+ (IdsBinaryOperatorV1Marker, "IDSB"),
+ (IdsTrinaryOperatorV1Marker, "IDST"),
+ (JoinControlV1Marker, "Join_C"),
+ (LogicalOrderExceptionV1Marker, "LOE"),
+ (LowercaseV1Marker, "Lower"),
+ (MathV1Marker, "Math"),
+ (NoncharacterCodePointV1Marker, "NChar"),
+ (NfcInertV1Marker, "nfcinert"),
+ (NfdInertV1Marker, "nfdinert"),
+ (NfkcInertV1Marker, "nfkcinert"),
+ (NfkdInertV1Marker, "nfkdinert"),
+ (PatternSyntaxV1Marker, "Pat_Syn"),
+ (PatternWhiteSpaceV1Marker, "Pat_WS"),
+ (PrependedConcatenationMarkV1Marker, "PCM"),
+ (PrintV1Marker, "print"),
+ (QuotationMarkV1Marker, "QMark"),
+ (RadicalV1Marker, "Radical"),
+ (RegionalIndicatorV1Marker, "RI"),
+ (SoftDottedV1Marker, "SD"),
+ (SegmentStarterV1Marker, "segstart"),
+ (CaseSensitiveV1Marker, "Sensitive"),
+ (SentenceTerminalV1Marker, "STerm"),
+ (TerminalPunctuationV1Marker, "Term"),
+ (UnifiedIdeographV1Marker, "UIdeo"),
+ (UppercaseV1Marker, "Upper"),
+ (VariationSelectorV1Marker, "VS"),
+ (WhiteSpaceV1Marker, "WSpace"),
+ (XdigitV1Marker, "xdigit"),
+ (XidContinueV1Marker, "XIDC"),
+ (XidStartV1Marker, "XIDS"),
+ ),
+ (
+ // UnicodeSets (code points + strings)
+ // The trailing boolean is the macro's `$us_singleton` argument.
+ (BasicEmojiV1Marker, "Basic_Emoji", true),
+ (ExemplarCharactersMainV1Marker, "exemplarchars/main", false),
+ (
+ ExemplarCharactersAuxiliaryV1Marker,
+ "exemplarchars/auxiliary",
+ false
+ ),
+ (
+ ExemplarCharactersPunctuationV1Marker,
+ "exemplarchars/punctuation",
+ false
+ ),
+ (
+ ExemplarCharactersNumbersV1Marker,
+ "exemplarchars/numbers",
+ false
+ ),
+ (
+ ExemplarCharactersIndexV1Marker,
+ "exemplarchars/index",
+ false
+ ),
+ ),
+ (
+ // code point maps
+ (
+ CanonicalCombiningClassV1Marker,
+ CanonicalCombiningClassNameToValueV1Marker,
+ (
+ sparse: CanonicalCombiningClassValueToShortNameV1Marker,
+ CanonicalCombiningClassValueToLongNameV1Marker
+ ),
+ "ccc",
+ CanonicalCombiningClass
+ ),
+ (
+ GeneralCategoryV1Marker,
+ GeneralCategoryNameToValueV1Marker,
+ (
+ linear: GeneralCategoryValueToShortNameV1Marker,
+ GeneralCategoryValueToLongNameV1Marker
+ ),
+ "gc",
+ GeneralCategory
+ ),
+ (
+ BidiClassV1Marker,
+ BidiClassNameToValueV1Marker,
+ (
+ linear: BidiClassValueToShortNameV1Marker,
+ BidiClassValueToLongNameV1Marker
+ ),
+ "bc",
+ BidiClass
+ ),
+ (
+ ScriptV1Marker,
+ ScriptNameToValueV1Marker,
+ (
+ linear4: ScriptValueToShortNameV1Marker,
+ ScriptValueToLongNameV1Marker
+ ),
+ "sc",
+ Script
+ ),
+ (
+ EastAsianWidthV1Marker,
+ EastAsianWidthNameToValueV1Marker,
+ (
+ linear: EastAsianWidthValueToShortNameV1Marker,
+ EastAsianWidthValueToLongNameV1Marker
+ ),
+ "ea",
+ EastAsianWidth
+ ),
+ (
+ LineBreakV1Marker,
+ LineBreakNameToValueV1Marker,
+ (
+ linear: LineBreakValueToShortNameV1Marker,
+ LineBreakValueToLongNameV1Marker
+ ),
+ "lb",
+ LineBreak
+ ),
+ (
+ GraphemeClusterBreakV1Marker,
+ GraphemeClusterBreakNameToValueV1Marker,
+ (
+ linear: GraphemeClusterBreakValueToShortNameV1Marker,
+ GraphemeClusterBreakValueToLongNameV1Marker
+ ),
+ "GCB",
+ GraphemeClusterBreak
+ ),
+ (
+ WordBreakV1Marker,
+ WordBreakNameToValueV1Marker,
+ (
+ linear: WordBreakValueToShortNameV1Marker,
+ WordBreakValueToLongNameV1Marker
+ ),
+ "WB",
+ WordBreak
+ ),
+ (
+ SentenceBreakV1Marker,
+ SentenceBreakNameToValueV1Marker,
+ (
+ linear: SentenceBreakValueToShortNameV1Marker,
+ SentenceBreakValueToLongNameV1Marker
+ ),
+ "SB",
+ SentenceBreak
+ ),
+ (
+ IndicSyllabicCategoryV1Marker,
+ IndicSyllabicCategoryNameToValueV1Marker,
+ (
+ linear: IndicSyllabicCategoryValueToShortNameV1Marker,
+ IndicSyllabicCategoryValueToLongNameV1Marker
+ ),
+ "InSC",
+ IndicSyllabicCategory
+ ),
+ // note: the names key for the GCM mask is handled above
+ )
+);
diff --git a/third_party/rust/icu_properties/src/provider/bidi_data.rs b/third_party/rust/icu_properties/src/provider/bidi_data.rs
new file mode 100644
index 0000000000..465ed4ebb7
--- /dev/null
+++ b/third_party/rust/icu_properties/src/provider/bidi_data.rs
@@ -0,0 +1,289 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
+//!
+//! <div class="stab unstable">
+//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
+//! to be stable, their Rust representation might not be. Use with caution.
+//! </div>
+//!
+//! Read more about data providers: [`icu_provider`]
+//!
+//! This module provides efficient storage for the data backing the following
+//! properties:
+//! - `Bidi_Paired_Bracket`
+//! - `Bidi_Paired_Bracket_Type`
+//! - `Bidi_Mirrored`
+//! - `Bidi_Mirroring_Glyph`
+
+use displaydoc::Display;
+use icu_collections::codepointtrie::{CodePointTrie, TrieValue};
+use icu_provider::prelude::*;
+use zerovec::ule::{AsULE, CharULE, ULE};
+use zerovec::ZeroVecError;
+
+/// A data provider struct for properties related to Bidi algorithms, including
+/// mirroring and bracket pairing.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[icu_provider::data_struct(marker(
+ BidiAuxiliaryPropertiesV1Marker,
+ "props/bidiauxiliaryprops@1",
+ singleton
+))]
+#[derive(Debug, Eq, PartialEq, Clone)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider::bidi_data),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct BidiAuxiliaryPropertiesV1<'data> {
+ /// A `CodePointTrie` whose per-code-point value is a `MirroredPairedBracketData`, from which
+ /// the supported Bidi properties (mirroring glyph, mirrored flag, bracket type) are extracted.
+ #[cfg_attr(feature = "serde", serde(borrow))] // borrow from the serialized input where possible
+ pub trie: CodePointTrie<'data, MirroredPairedBracketData>,
+}
+
+impl<'data> BidiAuxiliaryPropertiesV1<'data> {
+ #[doc(hidden)] // plain constructor: wraps an already-built trie without any validation
+ pub fn new(
+ trie: CodePointTrie<'data, MirroredPairedBracketData>,
+ ) -> BidiAuxiliaryPropertiesV1<'data> {
+ BidiAuxiliaryPropertiesV1 { trie }
+ }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))]
+#[doc(hidden)] // needed for datagen but not intended for users
+pub struct MirroredPairedBracketData {
+ pub mirroring_glyph: char, // Bidi_Mirroring_Glyph; occupies bits 0..=20 of the packed form
+ pub mirrored: bool, // Bidi_Mirrored; occupies bit 21 of the packed form
+ pub paired_bracket_type: CheckedBidiPairedBracketType, // occupies bits 22..=23 of the packed form
+}
+
+impl Default for MirroredPairedBracketData {
+ fn default() -> Self { // the default packs to the all-zero bit pattern
+ Self {
+ mirroring_glyph: 0 as char, // U+0000; NOTE(review): presumably means "no mirroring glyph" — confirm against callers
+ mirrored: false,
+ paired_bracket_type: CheckedBidiPairedBracketType::None,
+ }
+ }
+}
+
+impl From<MirroredPairedBracketData> for u32 {
+ fn from(mpbd: MirroredPairedBracketData) -> u32 { // packs the three fields into the low 24 bits
+ let mut result = mpbd.mirroring_glyph as u32; // bits 0..=20: code point (a `char` never exceeds 0x10FFFF)
+ result |= (mpbd.mirrored as u32) << 21; // bit 21: Bidi_Mirrored flag
+ result |= (mpbd.paired_bracket_type as u32) << 22; // bits 22..=23: enum discriminant (0..=2)
+ result
+ }
+}
+
+/// A `u32` serialized value of `MirroredPairedBracketData` did not encode either a valid Bidi_Mirroring_Glyph or a valid Bidi_Paired_Bracket_Type
+#[derive(Display, Debug, Clone, Copy, PartialEq, Eq)]
+#[displaydoc("Invalid MirroredPairedBracketData serialized in int: {0}")]
+pub struct MirroredPairedBracketDataTryFromError(u32); // carries the rejected raw value, shown as `{0}` in the message
+
+impl TryFrom<u32> for MirroredPairedBracketData {
+ type Error = MirroredPairedBracketDataTryFromError;
+
+ fn try_from(i: u32) -> Result<Self, MirroredPairedBracketDataTryFromError> { // inverse of the `From<..> for u32` packing
+ let code_point = i & 0x1FFFFF; // bits 0..=20: mirroring glyph code point
+ let mirroring_glyph =
+ char::try_from_u32(code_point).map_err(|_| MirroredPairedBracketDataTryFromError(i))?; // error keeps the full input
+ let mirrored = ((i >> 21) & 0x1) == 1; // bit 21: Bidi_Mirrored flag
+ let paired_bracket_type = {
+ let value = ((i >> 22) & 0x3) as u8; // bits 22..=23; bits 24..=31 of `i` are ignored
+ match value {
+ 0 => CheckedBidiPairedBracketType::None,
+ 1 => CheckedBidiPairedBracketType::Open,
+ 2 => CheckedBidiPairedBracketType::Close,
+ _ => { // only 3 remains: it has no corresponding enum variant
+ return Err(MirroredPairedBracketDataTryFromError(i));
+ }
+ }
+ };
+ Ok(MirroredPairedBracketData {
+ mirroring_glyph,
+ mirrored,
+ paired_bracket_type,
+ })
+ }
+}
+
+/// A closed Rust enum covering exactly the Bidi_Paired_Bracket_Type values (None, Open, Close)
+/// that the internal representation of `MirroredPairedBracketData` can hold, so that the ULE
+/// validity invariants can be checked against a fixed set of discriminants.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))]
+#[repr(u8)] // the discriminants below are the values stored in bits 22..=23 of the packed form
+#[zerovec::make_ule(CheckedBidiPairedBracketTypeULE)]
+// This enum is closed so that ULE validation of MirroredPairedBracketData can reject unknown discriminants.
+#[allow(clippy::exhaustive_enums)]
+pub enum CheckedBidiPairedBracketType {
+ /// Not a paired bracket.
+ None = 0,
+ /// Open paired bracket.
+ Open = 1,
+ /// Close paired bracket.
+ Close = 2,
+}
+
+/// Bit layout for the 24 bits (0..=23) of the `[u8; 3]` ULE raw type.
+/// Little-endian: byte 0 holds bits 0..=7, byte 1 holds bits 8..=15, byte 2 holds bits 16..=23.
+/// 0..=20 Code point of the Bidi_Mirroring_Glyph value
+/// extracted with: mask = 0x1FFFFF <=> [bytes[0], bytes[1], bytes[2] & 0x1F]
+/// 21..=21 Boolean for Bidi_Mirrored
+/// extracted with: bitshift right by 21 followed by mask = 0x1 <=> (bytes[2] >> 5) & 0x1
+/// 22..=23 Enum discriminant value for Bidi_Paired_Bracket_Type
+/// extracted with: bitshift right by 22 followed by mask = 0x3 <=> (bytes[2] >> 6) & 0x3
+/// <=> (bytes[2] >> 6), because a right shift on an unsigned value fills with 0s
+/// and a byte has only 8 bits
+#[doc(hidden)]
+/// needed for datagen but not intended for users
+#[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)]
+#[repr(packed)] // the only field is `[u8; 3]`: size 3, alignment 1, no padding
+pub struct MirroredPairedBracketDataULE([u8; 3]);
+
+// Safety (based on the safety checklist on the ULE trait):
+// 1. MirroredPairedBracketDataULE does not include any uninitialized or padding bytes
+// (achieved by `#[repr(packed)]` on a wrapper around `[u8; 3]`, which satisfies this invariant)
+// 2. MirroredPairedBracketDataULE is aligned to 1 byte.
+// (achieved by `#[repr(packed)]` on a wrapper around `[u8; 3]`, which satisfies this invariant)
+// 3. The impl of validate_byte_slice() returns an error if any byte is not valid.
+// 4. The impl of validate_byte_slice() returns an error if there are extra bytes.
+// 5. The other ULE methods use the default impl.
+// 6. MirroredPairedBracketDataULE byte equality is semantic equality: all 24 bits are
+// used, so there are no unused bits that would need to be zeroed out before comparing
+unsafe impl ULE for MirroredPairedBracketDataULE {
+ #[inline]
+ fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> {
+ if bytes.len() % 3 != 0 { // the input must consist of whole 3-byte elements
+ return Err(ZeroVecError::length::<Self>(bytes.len()));
+ }
+ // Validate each 3-byte element
+ #[allow(clippy::indexing_slicing)] // Won't panic because the chunks are always 3 bytes long
+ for byte_triple in bytes.chunks_exact(3) {
+ // Bidi_Mirroring_Glyph validation
+ #[allow(clippy::unwrap_used)] // chunks_exact returns slices of length 3
+ let [byte0, byte1, byte2] = *<&[u8; 3]>::try_from(byte_triple).unwrap();
+ let mut mirroring_glyph_code_point: u32 = (byte2 & 0x1F) as u32; // bits 16..=20
+ mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte1 as u32); // bits 8..=15
+ mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte0 as u32); // bits 0..=7
+ let _mirroring_glyph =
+ char::from_u32(mirroring_glyph_code_point).ok_or(ZeroVecError::parse::<Self>())?; // must be a valid `char`
+
+ // skip validating the Bidi_Mirrored boolean since it is always valid
+
+ // reject the 4th value expressible in the 2 Bidi_Paired_Bracket_Type bits, because the
+ // enum has only 3 values: Open, Close, None
+ if (byte2 & 0xC0) == 0xC0 { // both top bits set <=> discriminant 3
+ return Err(ZeroVecError::parse::<Self>());
+ }
+ }
+
+ Ok(())
+ }
+}
+
+impl AsULE for MirroredPairedBracketData {
+ type ULE = MirroredPairedBracketDataULE;
+
+ #[inline]
+ fn to_unaligned(self) -> Self::ULE {
+ let mut ch = u32::from(self.mirroring_glyph); // bits 0..=20: code point
+ ch |= u32::from(self.mirrored) << 21; // bit 21: Bidi_Mirrored flag
+ ch |= (self.paired_bracket_type as u32) << 22; // bits 22..=23: enum discriminant
+ let [byte0, byte1, byte2, _] = ch.to_le_bytes(); // the dropped high byte is zero: only bits 0..=23 are ever set
+ MirroredPairedBracketDataULE([byte0, byte1, byte2])
+ }
+
+ #[inline]
+ fn from_unaligned(unaligned: Self::ULE) -> Self {
+ let [unaligned_byte0, unaligned_byte1, unaligned_byte2] = unaligned.0;
+ let mirroring_glyph_ule_bytes = &[unaligned_byte0, unaligned_byte1, unaligned_byte2 & 0x1F];
+ // Safety: bits 0..=20 of the MirroredPairedBracketDataULE bytes hold the code point, so after masking
+ // off the top three bits of byte 2 the three bytes are the CharULE bytes of a valid Unicode code point.
+ let mirroring_glyph_ule =
+ unsafe { CharULE::from_byte_slice_unchecked(mirroring_glyph_ule_bytes) };
+ let mirroring_glyph = mirroring_glyph_ule
+ .first()
+ .map(|ule| char::from_unaligned(*ule))
+ .unwrap_or(char::REPLACEMENT_CHARACTER); // fallback used only if `first()` yields nothing
+ let mirrored = ((unaligned.0[2] >> 5) & 0x1) == 1; // bit 21 overall = bit 5 of byte 2
+ let paired_bracket_type = {
+ let discriminant = unaligned.0[2] >> 6; // bits 22..=23 overall = top two bits of byte 2
+ debug_assert!(
+ discriminant != 3,
+ "Bidi_Paired_Bracket_Type can only be Open/Close/None in MirroredPairedBracketData"
+ );
+ match discriminant {
+ 1 => CheckedBidiPairedBracketType::Open,
+ 2 => CheckedBidiPairedBracketType::Close,
+ _ => CheckedBidiPairedBracketType::None, // 0, and also 3 when debug assertions are disabled
+ }
+ };
+
+ MirroredPairedBracketData {
+ mirroring_glyph,
+ mirrored,
+ paired_bracket_type,
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_parse() {
+ // data for U+007B LEFT CURLY BRACKET
+
+ // serialize to ULE bytes
+ let data = MirroredPairedBracketData {
+ mirroring_glyph: '}',
+ mirrored: true,
+ paired_bracket_type: CheckedBidiPairedBracketType::Open,
+ };
+ let expected_bytes = &[0x7D, 0x0, 0x60]; // 0x7D = '}'; 0x60 = mirrored bit (0x20) | Open discriminant 1 << 6 (0x40)
+ assert_eq!(
+ expected_bytes,
+ MirroredPairedBracketDataULE::as_byte_slice(&[data.to_unaligned()])
+ );
+
+ // deserialize from ULE bytes and check that the round trip is lossless
+ let ule = MirroredPairedBracketDataULE::parse_byte_slice(expected_bytes).unwrap();
+ let parsed_data = MirroredPairedBracketData::from_unaligned(*ule.first().unwrap());
+ assert_eq!(data, parsed_data);
+ }
+
+ #[test]
+ fn test_parse_error() {
+ // data for U+007B LEFT CURLY BRACKET
+ let ule_bytes = &mut [0x7D, 0x0, 0x60];
+
+ // Set discriminant value for the CheckedBidiPairedBracketType enum to be invalid.
+ // CheckedBidiPairedBracketType only has 3 values (discriminants => 0..=2), so the 4th
+ // expressible value from the 2 bits (3) should not parse successfully.
+ ule_bytes[2] |= 0xC0; // force both discriminant bits to 1
+
+ // deserialize from ULE bytes
+ let ule_parse_result = MirroredPairedBracketDataULE::parse_byte_slice(ule_bytes);
+ assert!(ule_parse_result.is_err());
+ }
+}
diff --git a/third_party/rust/icu_properties/src/provider/names.rs b/third_party/rust/icu_properties/src/provider/names.rs
new file mode 100644
index 0000000000..f521f715ce
--- /dev/null
+++ b/third_party/rust/icu_properties/src/provider/names.rs
@@ -0,0 +1,277 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! 🚧 \[Unstable\] Property names-related data for this component
+//!
+//! <div class="stab unstable">
+//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
+//! to be stable, their Rust representation might not be. Use with caution.
+//! </div>
+//!
+//! Read more about data providers: [`icu_provider`]
+
+use alloc::boxed::Box;
+use core::cmp::Ordering;
+
+use core::str;
+
+use icu_provider::prelude::*;
+
+use tinystr::TinyStr4;
+use zerovec::ule::{UnvalidatedStr, VarULE};
+use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap, ZeroVec};
+
+/// This is a property name that can be "loose matched" as according to
+/// [PropertyValueAliases.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
+///
+/// (matched case-insensitively in ASCII, ignoring underscores, whitespace, and hyphens)
+///
+/// This is expected to be ASCII, but we do not rely on this invariant anywhere except during
+/// datagen.
+///
+/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
+/// will sort into the same area, such that a map can be searched for both strict and loose equality.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+///
+/// # Examples
+///
+/// Using a [`NormalizedPropertyNameStr`] as the key of a [`ZeroMap`]:
+///
+/// ```
+/// use icu_properties::provider::names::NormalizedPropertyNameStr;
+/// use zerovec::ZeroMap;
+///
+/// let map: ZeroMap<NormalizedPropertyNameStr, usize> = [
+/// (NormalizedPropertyNameStr::from_str("A_BC"), 11),
+/// (NormalizedPropertyNameStr::from_str("dEf"), 22),
+/// (NormalizedPropertyNameStr::from_str("G_H-I"), 33),
+/// ]
+/// .into_iter()
+/// .collect();
+///
+/// let key_approx = NormalizedPropertyNameStr::from_str("AB-C");
+/// let key_exact = NormalizedPropertyNameStr::from_str("A_BC");
+///
+/// // Strict lookup:
+/// assert_eq!(None, map.get_copied(key_approx));
+/// assert_eq!(Some(11), map.get_copied(key_exact));
+///
+/// // Loose lookup:
+/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_approx)));
+/// assert_eq!(Some(11), map.get_copied_by(|u| u.cmp_loose(key_exact)));
+/// ```
+#[derive(PartialEq, Eq)] // VarULE wants these to be byte equality
+#[derive(Debug, VarULE)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize))]
+#[repr(transparent)] // relied upon by the transmutes in `cast_ref` and `cast_box`
+pub struct NormalizedPropertyNameStr(UnvalidatedStr);
+
+/// This impl requires enabling the optional `serde` Cargo feature of the `icu_properties` crate
+#[cfg(feature = "serde")]
+impl<'de> serde::Deserialize<'de> for Box<NormalizedPropertyNameStr> {
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+ where
+ D: serde::Deserializer<'de>,
+ {
+ <Box<UnvalidatedStr>>::deserialize(deserializer).map(NormalizedPropertyNameStr::cast_box) // deserialize the inner type, then re-wrap the box
+ }
+}
+
+/// This impl requires enabling the optional `serde` Cargo feature of the `icu_properties` crate
+#[cfg(feature = "serde")]
+impl<'de, 'a> serde::Deserialize<'de> for &'a NormalizedPropertyNameStr
+where
+ 'de: 'a, // the deserializer's input must outlive the borrowed result
+{
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+ where
+ D: serde::Deserializer<'de>,
+ {
+ <&UnvalidatedStr>::deserialize(deserializer).map(NormalizedPropertyNameStr::cast_ref) // deserialize the inner reference, then cast it
+ }
+}
+
+impl<'a> ZeroMapKV<'a> for NormalizedPropertyNameStr { // lets this type be used as a `ZeroMap` key or value
+ type Container = VarZeroVec<'a, NormalizedPropertyNameStr>; // stored as variable-length elements
+ type Slice = VarZeroSlice<NormalizedPropertyNameStr>;
+ type GetType = NormalizedPropertyNameStr;
+ type OwnedType = Box<NormalizedPropertyNameStr>; // owned form is the boxed string
+}
+
+/// The Ord/PartialOrd impl will sort things using strict equality, but in such a way that all loose-equal items
+/// will sort into the same area, such that a map can be searched for both strict and loose equality.
+impl PartialOrd for NormalizedPropertyNameStr {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other)) // the order is total, so simply defer to `Ord::cmp`
+ }
+}
+
+/// Normalize a single byte according to the "loose matching" described in PropertyValueAliases.txt,
+/// returning `None` for skippable bytes and the ASCII-lowercased byte otherwise
+///
+/// ICU has [code for this][1] (and [during property lookup][2]) which we emulate.
+/// In particular, ICU only does normalization within ASCII, which makes sense since character names
+/// seem to be only ASCII.
+///
+/// [1]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L35
+/// [2]: https://github.com/unicode-org/icu/blob/288c4c7555915ce7b1fb675d94ddd495058fc039/icu4c/source/common/propname.cpp#L226-L230
+fn normalize_char(ch: u8) -> Option<u8> {
+ match ch {
+ // skip all ASCII whitespace
+ ch if ch.is_ascii_whitespace() => None,
+ // skip underscores and hyphens, plus vertical tab (0x0B), which
+ // is_ascii_whitespace() does not cover
+ b'_' | b'-' | 0x0B => None,
+ // ignore case by lowercasing; non-ASCII bytes pass through unchanged
+ ch => Some(ch.to_ascii_lowercase()),
+ }
+}
+
+/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
+/// will sort into the same area, such that a map can be searched for both strict and loose equality.
+impl Ord for NormalizedPropertyNameStr {
+ fn cmp(&self, other: &Self) -> Ordering {
+ let cmp = self.cmp_loose(other); // primary key: the loose (normalized) ordering
+ // When loose equality holds, break the tie with a strict comparison of the underlying strings
+ if cmp == Ordering::Equal {
+ self.0.cmp(&other.0)
+ } else {
+ cmp
+ }
+ }
+}
+
+impl NormalizedPropertyNameStr {
+ /// Perform the loose comparison as defined in [`NormalizedPropertyNameStr`].
+ pub fn cmp_loose(&self, other: &Self) -> Ordering {
+ let self_iter = self.0.iter().copied().filter_map(normalize_char); // drop skippable bytes, lowercase the rest
+ let other_iter = other.0.iter().copied().filter_map(normalize_char);
+ self_iter.cmp(other_iter) // lexicographic comparison of the normalized byte sequences
+ }
+
+ /// Convert a string reference to a [`NormalizedPropertyNameStr`] reference; normalization happens only when comparing.
+ pub const fn from_str(s: &str) -> &Self {
+ Self::cast_ref(UnvalidatedStr::from_str(s))
+ }
+
+ /// Convert a [`UnvalidatedStr`] reference to a [`NormalizedPropertyNameStr`] reference.
+ pub const fn cast_ref(value: &UnvalidatedStr) -> &Self {
+ // Safety: `Self` is `repr(transparent)` over `UnvalidatedStr`, so both reference types share one layout
+ unsafe { core::mem::transmute(value) }
+ }
+
+ /// Convert a [`UnvalidatedStr`] box to a [`NormalizedPropertyNameStr`] box.
+ pub const fn cast_box(value: Box<UnvalidatedStr>) -> Box<Self> {
+ // Safety: `Self` is `repr(transparent)` over `UnvalidatedStr`, so both box types share one layout
+ unsafe { core::mem::transmute(value) }
+ }
+
+ /// Get a [`NormalizedPropertyNameStr`] box from a byte slice.
+ pub fn boxed_from_bytes(b: &[u8]) -> Box<Self> {
+ Self::cast_box(UnvalidatedStr::from_boxed_bytes(b.into())) // `b.into()` converts the borrowed bytes into the owned input
+ }
+}
+
+/// A mapping of property value names to their enum discriminants, searchable by strict or loose name matching.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, Clone, PartialEq)]
+#[icu_provider::data_struct(marker(
+ GeneralCategoryMaskNameToValueV1Marker,
+ "propnames/from/gcm@1",
+ singleton,
+))]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider::names),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[yoke(prove_covariance_manually)]
+pub struct PropertyValueNameToEnumMapV1<'data> {
+ /// A map from names to their value discriminant
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub map: ZeroMap<'data, NormalizedPropertyNameStr, u16>, // keys are ordered by `NormalizedPropertyNameStr`'s `Ord` impl
+}
+
+/// A sparse mapping of property values to their names, keyed by value discriminant. A single instance
+/// of this map will only cover either long or short names, determined whilst loading data.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, Clone, PartialEq)]
+#[icu_provider::data_struct]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider::names),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[yoke(prove_covariance_manually)]
+pub struct PropertyEnumToValueNameSparseMapV1<'data> {
+ /// A map from the value discriminant to the names
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub map: ZeroMap<'data, u16, str>, // NOTE(review): presumably for discriminants too scattered for the index-based linear maps — confirm
+}
+
+/// A linear (index-based) mapping of property values to their names. A single instance of this map
+/// will only cover either long or short names, determined whilst loading data.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, Clone, PartialEq)]
+#[icu_provider::data_struct]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider::names),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[yoke(prove_covariance_manually)]
+pub struct PropertyEnumToValueNameLinearMapV1<'data> {
+ /// A map from the value discriminant (the index) to the names, for mostly
+ /// contiguous data. Empty strings count as missing.
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub map: VarZeroVec<'data, str>, // element i is the name of the value whose discriminant is i
+}
+
+/// A linear (index-based) mapping of property values to their names, with each name stored as a `TinyStr4`.
+/// A single instance will only cover either long or short names, determined whilst loading data.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, Clone, PartialEq)]
+#[icu_provider::data_struct]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_properties::provider::names),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+#[yoke(prove_covariance_manually)]
+pub struct PropertyEnumToValueNameLinearTiny4MapV1<'data> {
+ /// A map from the value discriminant (the index) to the names, for mostly
+ /// contiguous data. Empty strings count as missing.
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub map: ZeroVec<'data, TinyStr4>, // element i is the name of the value whose discriminant is i
+}
diff --git a/third_party/rust/icu_properties/src/runtime.rs b/third_party/rust/icu_properties/src/runtime.rs
new file mode 100644
index 0000000000..79307dd6f1
--- /dev/null
+++ b/third_party/rust/icu_properties/src/runtime.rs
@@ -0,0 +1,360 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you
+//! have a use case for this!
+//!
+//! This module contains utilities for working with properties where the specific property in use
+//! is not known at compile time.
+//!
+//! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working
+//! with properties at runtime tailored for the use case of ECMA262-compatible regex engines.
+
+#[cfg(doc)]
+use crate::{maps, script, GeneralCategory, GeneralCategoryGroup, Script};
+
+/// This type can represent any Unicode property.
+///
+/// This is intended to be used in situations where the exact Unicode property needed is
+/// only known at runtime, for example in regex engines.
+///
+/// The wrapped `u32` values are intended to be identical to those of ICU4C's `UProperty` enum.
+#[allow(clippy::exhaustive_structs)] // newtype
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
+pub struct UnicodeProperty(pub u32);
+
+#[allow(non_upper_case_globals)]
+#[allow(unused)] // experimental, may be made public later
+impl UnicodeProperty {
+ /// Binary property `Alphabetic`
+ pub const Alphabetic: Self = UnicodeProperty(0);
+ /// Binary property `ASCII_Hex_Digit`
+ pub const AsciiHexDigit: Self = UnicodeProperty(1);
+ /// Binary property `Bidi_Control`
+ pub const BidiControl: Self = UnicodeProperty(2);
+ /// Binary property `Bidi_Mirrored`
+ pub const BidiMirrored: Self = UnicodeProperty(3);
+ /// Binary property `Dash`
+ pub const Dash: Self = UnicodeProperty(4);
+ /// Binary property `Default_Ignorable_Code_Point`
+ pub const DefaultIgnorableCodePoint: Self = UnicodeProperty(5);
+ /// Binary property `Deprecated`
+ pub const Deprecated: Self = UnicodeProperty(6);
+ /// Binary property `Diacritic`
+ pub const Diacritic: Self = UnicodeProperty(7);
+ /// Binary property `Extender`
+ pub const Extender: Self = UnicodeProperty(8);
+ /// Binary property `Full_Composition_Exclusion`
+ pub const FullCompositionExclusion: Self = UnicodeProperty(9);
+ /// Binary property `Grapheme_Base`
+ pub const GraphemeBase: Self = UnicodeProperty(10);
+ /// Binary property `Grapheme_Extend`
+ pub const GraphemeExtend: Self = UnicodeProperty(11);
+ /// Binary property `Grapheme_Link`
+ pub const GraphemeLink: Self = UnicodeProperty(12);
+ /// Binary property `Hex_Digit`
+ pub const HexDigit: Self = UnicodeProperty(13);
+ /// Binary property `Hyphen`
+ pub const Hyphen: Self = UnicodeProperty(14);
+ /// Binary property `ID_Continue`
+ pub const IdContinue: Self = UnicodeProperty(15);
+ /// Binary property `ID_Start`
+ pub const IdStart: Self = UnicodeProperty(16);
+ /// Binary property `Ideographic`
+ pub const Ideographic: Self = UnicodeProperty(17);
+ /// Binary property `IDS_Binary_Operator`
+ pub const IdsBinaryOperator: Self = UnicodeProperty(18);
+ /// Binary property `IDS_Trinary_Operator`
+ pub const IdsTrinaryOperator: Self = UnicodeProperty(19);
+ /// Binary property `Join_Control`
+ pub const JoinControl: Self = UnicodeProperty(20);
+ /// Binary property `Logical_Order_Exception`
+ pub const LogicalOrderException: Self = UnicodeProperty(21);
+ /// Binary property `Lowercase`
+ pub const Lowercase: Self = UnicodeProperty(22);
+ /// Binary property `Math`
+ pub const Math: Self = UnicodeProperty(23);
+ /// Binary property `Noncharacter_Code_Point`
+ pub const NoncharacterCodePoint: Self = UnicodeProperty(24);
+ /// Binary property `Quotation_Mark`
+ pub const QuotationMark: Self = UnicodeProperty(25);
+ /// Binary property `Radical`
+ pub const Radical: Self = UnicodeProperty(26);
+ /// Binary property `Soft_Dotted`
+ pub const SoftDotted: Self = UnicodeProperty(27);
+ /// Binary property `Terminal_Punctuation`
+ pub const TerminalPunctuation: Self = UnicodeProperty(28);
+ /// Binary property `Unified_Ideograph`
+ pub const UnifiedIdeograph: Self = UnicodeProperty(29);
+ /// Binary property `Uppercase`
+ pub const Uppercase: Self = UnicodeProperty(30);
+ /// Binary property `White_Space`
+ pub const WhiteSpace: Self = UnicodeProperty(31);
+ /// Binary property `XID_Continue`
+ pub const XidContinue: Self = UnicodeProperty(32);
+ /// Binary property `XID_Start`
+ pub const XidStart: Self = UnicodeProperty(33);
+ /// Binary property `Case_Sensitive`
+ pub const CaseSensitive: Self = UnicodeProperty(34);
+ /// Binary property `Sentence_Terminal`
+ pub const SentenceTerminal: Self = UnicodeProperty(35);
+ /// Binary property `Variation_Selector`
+ pub const VariationSelector: Self = UnicodeProperty(36);
+ /// Binary property `NFD_Inert`
+ pub const NfdInert: Self = UnicodeProperty(37);
+ /// Binary property `NFKD_Inert`
+ pub const NfkdInert: Self = UnicodeProperty(38);
+ /// Binary property `NFC_Inert`
+ pub const NfcInert: Self = UnicodeProperty(39);
+ /// Binary property `NFKC_Inert`
+ pub const NfkcInert: Self = UnicodeProperty(40);
+ /// Binary property `Segment_Starter`
+ pub const SegmentStarter: Self = UnicodeProperty(41);
+ /// Binary property `Pattern_Syntax`
+ pub const PatternSyntax: Self = UnicodeProperty(42);
+ /// Binary property `Pattern_White_Space`
+ pub const PatternWhiteSpace: Self = UnicodeProperty(43);
+ /// Binary property `alnum`
+ pub const Alnum: Self = UnicodeProperty(44);
+ /// Binary property `blank`
+ pub const Blank: Self = UnicodeProperty(45);
+ /// Binary property `graph`
+ pub const Graph: Self = UnicodeProperty(46);
+ /// Binary property `print`
+ pub const Print: Self = UnicodeProperty(47);
+ /// Binary property `xdigit`
+ pub const XDigit: Self = UnicodeProperty(48);
+ /// Binary property `Cased`
+ pub const Cased: Self = UnicodeProperty(49);
+ /// Binary property `Case_Ignorable`
+ pub const CaseIgnorable: Self = UnicodeProperty(50);
+ /// Binary property `Changes_When_Lowercased`
+ pub const ChangesWhenLowercased: Self = UnicodeProperty(51);
+ /// Binary property `Changes_When_Uppercased`
+ pub const ChangesWhenUppercased: Self = UnicodeProperty(52);
+ /// Binary property `Changes_When_Titlecased`
+ pub const ChangesWhenTitlecased: Self = UnicodeProperty(53);
+ /// Binary property `Changes_When_Casefolded`
+ pub const ChangesWhenCasefolded: Self = UnicodeProperty(54);
+ /// Binary property `Changes_When_Casemapped`
+ pub const ChangesWhenCasemapped: Self = UnicodeProperty(55);
+ /// Binary property `Changes_When_NFKC_Casefolded`
+ pub const ChangesWhenNfkcCasefolded: Self = UnicodeProperty(56);
+ /// Binary property `Emoji`
+ pub const Emoji: Self = UnicodeProperty(57);
+ /// Binary property `Emoji_Presentation`
+ pub const EmojiPresentation: Self = UnicodeProperty(58);
+ /// Binary property `Emoji_Modifier`
+ pub const EmojiModifier: Self = UnicodeProperty(59);
+ /// Binary property `Emoji_Modifier_Base`
+ pub const EmojiModifierBase: Self = UnicodeProperty(60);
+ /// Binary property `Emoji_Component`
+ pub const EmojiComponent: Self = UnicodeProperty(61);
+ /// Binary property `Regional_Indicator`
+ pub const RegionalIndicator: Self = UnicodeProperty(62);
+ /// Binary property `Prepended_Concatenation_Mark`
+ pub const PrependedConcatenationMark: Self = UnicodeProperty(63);
+ /// Binary property `Extended_Pictographic`
+ pub const ExtendedPictographic: Self = UnicodeProperty(64);
+ /// Binary property `Basic_Emoji`
+ pub const BasicEmoji: Self = UnicodeProperty(65);
+ /// Binary property `Emoji_Keycap_Sequence`
+ pub const EmojiKeycapSequence: Self = UnicodeProperty(66);
+ /// Binary property `RGI_Emoji_Modifier_Sequence`
+ pub const RgiEmojiModifierSequence: Self = UnicodeProperty(67);
+ /// Binary property `RGI_Emoji_Flag_Sequence`
+ pub const RgiEmojiFlagSequence: Self = UnicodeProperty(68);
+ /// Binary property `RGI_Emoji_Tag_Sequence`
+ pub const RgiEmojiTagSequence: Self = UnicodeProperty(69);
+ /// Binary property `RGI_Emoji_ZWJ_Sequence`
+ pub const RgiEmojiZWJSequence: Self = UnicodeProperty(70);
+ /// Binary property `RGI_Emoji`
+ pub const RgiEmoji: Self = UnicodeProperty(71);
+
+ const BINARY_MAX: Self = Self::RgiEmoji;
+
+ /// Enumerated property `Bidi_Class`
+ pub const BidiClass: Self = UnicodeProperty(0x1000);
+ /// Enumerated property `Block`
+ pub const Block: Self = UnicodeProperty(0x1001);
+ /// Enumerated property `Canonical_Combining_Class`
+ pub const CombiningClass: Self = UnicodeProperty(0x1002);
+ /// Enumerated property `Decomposition_Type`
+ pub const DecompositionType: Self = UnicodeProperty(0x1003);
+ /// Enumerated property `East_Asian_Width`
+ pub const EastAsianWidth: Self = UnicodeProperty(0x1004);
+ /// Enumerated property `General_Category`
+ pub const GeneralCategory: Self = UnicodeProperty(0x1005);
+ /// Enumerated property `Joining_Group`
+ pub const JoiningGroup: Self = UnicodeProperty(0x1006);
+ /// Enumerated property `Joining_Type`
+ pub const JoiningType: Self = UnicodeProperty(0x1007);
+ /// Enumerated property `Line_Break`
+ pub const LineBreak: Self = UnicodeProperty(0x1008);
+ /// Enumerated property `Numeric_Type`
+ pub const NumericType: Self = UnicodeProperty(0x1009);
+ /// Enumerated property `Script`
+ pub const Script: Self = UnicodeProperty(0x100A);
+ /// Enumerated property `Hangul_Syllable_Type`
+ pub const HangulSyllableType: Self = UnicodeProperty(0x100B);
+ /// Enumerated property `NFD_Quick_Check`
+ pub const NFDQuickCheck: Self = UnicodeProperty(0x100C);
+ /// Enumerated property `NFKD_Quick_Check`
+ pub const NFKDQuickCheck: Self = UnicodeProperty(0x100D);
+ /// Enumerated property `NFC_Quick_Check`
+ pub const NFCQuickCheck: Self = UnicodeProperty(0x100E);
+ /// Enumerated property `NFKC_Quick_Check`
+ pub const NFKCQuickCheck: Self = UnicodeProperty(0x100F);
+ /// Enumerated property `Lead_Canonical_Combining_Class`
+ pub const LeadCanonicalCombiningClass: Self = UnicodeProperty(0x1010);
+ /// Enumerated property `Trail_Canonical_Combining_Class`
+ pub const TrailCanonicalCombiningClass: Self = UnicodeProperty(0x1011);
+ /// Enumerated property `Grapheme_Cluster_Break`
+ pub const GraphemeClusterBreak: Self = UnicodeProperty(0x1012);
+ /// Enumerated property `Sentence_Break`
+ pub const SentenceBreak: Self = UnicodeProperty(0x1013);
+ /// Enumerated property `Word_Break`
+ pub const WordBreak: Self = UnicodeProperty(0x1014);
+ /// Enumerated property `Bidi_Paired_Bracket_Type`
+ pub const BidiPairedBracketType: Self = UnicodeProperty(0x1015);
+ /// Enumerated property `Indic_Positional_Category`
+ pub const IndicPositionalCategory: Self = UnicodeProperty(0x1016);
+ /// Enumerated property `Indic_Syllabic_Category`
+ pub const IndicSyllabicCategory: Self = UnicodeProperty(0x1017);
+ /// Enumerated property `Vertical_Orientation`
+ pub const VerticalOrientation: Self = UnicodeProperty(0x1018);
+
+ const ENUMERATED_MAX: Self = Self::VerticalOrientation;
+
+ /// Mask property `General_Category_Mask`
+ pub const GeneralCategoryMask: Self = UnicodeProperty(0x2000);
+
+ /// Double property `Numeric_Value`
+ pub const NumericValue: Self = UnicodeProperty(0x3000);
+
+ /// String property `Age`
+ pub const Age: Self = UnicodeProperty(0x4000);
+ /// String property `Bidi_Mirroring_Glyph`
+ pub const BidiMirroringGlyph: Self = UnicodeProperty(0x4001);
+ /// String property `Case_Folding`
+ pub const CaseFolding: Self = UnicodeProperty(0x4002);
+ /// String property `ISO_Comment`
+ pub const ISOComment: Self = UnicodeProperty(0x4003);
+ /// String property `Lowercase_Mapping`
+ pub const LowercaseMapping: Self = UnicodeProperty(0x4004);
+ /// String property `Name`
+ pub const Name: Self = UnicodeProperty(0x4005);
+ /// String property `Simple_Case_Folding`
+ pub const SimpleCaseFolding: Self = UnicodeProperty(0x4006);
+ /// String property `Simple_Lowercase_Mapping`
+ pub const SimpleLowercaseMapping: Self = UnicodeProperty(0x4007);
+ /// String property `Simple_Titlecase_Mapping`
+ pub const SimpleTitlecaseMapping: Self = UnicodeProperty(0x4008);
+ /// String property `Simple_Uppercase_Mapping`
+ pub const SimpleUppercaseMapping: Self = UnicodeProperty(0x4009);
+ /// String property `Titlecase_Mapping`
+ pub const TitlecaseMapping: Self = UnicodeProperty(0x400A);
+ /// String property `Unicode_1_Name`
+ pub const Unicode1_Name: Self = UnicodeProperty(0x400B);
+ /// String property `Uppercase_Mapping`
+ pub const UppercaseMapping: Self = UnicodeProperty(0x400C);
+ /// String property `Bidi_Paired_Bracket`
+ pub const BidiPairedBracket: Self = UnicodeProperty(0x400D);
+
+ const STRING_MAX: Self = Self::BidiPairedBracket;
+
+ /// Misc property `Script_Extensions`
+ pub const ScriptExtensions: Self = UnicodeProperty(0x7000);
+}
+
+#[allow(unused)] // experimental, may be made public later
+impl UnicodeProperty {
+ /// Given a property name (long, short, or alias), returns the corresponding [`UnicodeProperty`]
+ /// value for it provided it belongs to the [subset relevant for ECMA262 regexes][subset]
+ ///
+ /// Returns none if the name does not match any of the names in this subset. Performs
+ /// strict matching of names.
+ ///
+ /// If using this to implement an ECMA262-compliant regex engine, please note these caveats:
+ ///
+ /// - This only returns binary and enumerated properties, as well as [`Self::ScriptExtensions`].
+ /// Lookup can be performed sufficiently with [`Self::load_ecma262_binary_property_unstable()`],
+ /// [`maps::load_general_category()`], [`maps::load_script()`] and [`script::load_script_with_extensions_unstable()`].
+ /// - This does not handle the `Any`, `Assigned`, or `ASCII` pseudoproperties, since they are not
+ /// defined as properties.
+ /// - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]`
+ /// - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`).
+ /// - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]`
+ /// - ECMA262 regexes transparently allow `General_Category_Mask` values for `GeneralCategory`.
+ /// This method does not return [`Self::GeneralCategoryMask`], and instead relies on the caller to use mask-related lookup
+ /// functions where necessary.
+ /// - ECMA262 regexes allow treating `General_Category` (and `gcm`) values as binary properties,
+ /// e.g. you can do things like `\p{Lu}` as shortform for `\p{gc=Lu}`. This method does not do so
+ /// since these are property values, not properties, but you can use
+ /// [`GeneralCategory::get_name_to_enum_mapper()`] or [`GeneralCategoryGroup::get_name_to_enum_mapper()`]
+ /// to handle this.
+ ///
+ ///
+ /// [subset]: https://tc39.es/ecma262/#table-nonbinary-unicode-properties
+ pub fn parse_ecma262_name(name: &str) -> Option<Self> {
+     // Exact string comparison against every long name and short alias; the
+     // fallthrough arm leaves the function with `None` for anything else.
+     Some(match name {
+         // Non-binary properties.
+         "General_Category" | "gc" => Self::GeneralCategory,
+         "Script" | "sc" => Self::Script,
+         "Script_Extensions" | "scx" => Self::ScriptExtensions,
+         // Binary properties.
+         "ASCII_Hex_Digit" | "AHex" => Self::AsciiHexDigit,
+         "Alphabetic" | "Alpha" => Self::Alphabetic,
+         "Bidi_Control" | "Bidi_C" => Self::BidiControl,
+         "Bidi_Mirrored" | "Bidi_M" => Self::BidiMirrored,
+         "Case_Ignorable" | "CI" => Self::CaseIgnorable,
+         "Cased" => Self::Cased,
+         "Changes_When_Casefolded" | "CWCF" => Self::ChangesWhenCasefolded,
+         "Changes_When_Casemapped" | "CWCM" => Self::ChangesWhenCasemapped,
+         "Changes_When_Lowercased" | "CWL" => Self::ChangesWhenLowercased,
+         "Changes_When_NFKC_Casefolded" | "CWKCF" => Self::ChangesWhenNfkcCasefolded,
+         "Changes_When_Titlecased" | "CWT" => Self::ChangesWhenTitlecased,
+         "Changes_When_Uppercased" | "CWU" => Self::ChangesWhenUppercased,
+         "Dash" => Self::Dash,
+         "Default_Ignorable_Code_Point" | "DI" => Self::DefaultIgnorableCodePoint,
+         "Deprecated" | "Dep" => Self::Deprecated,
+         "Diacritic" | "Dia" => Self::Diacritic,
+         "Emoji" => Self::Emoji,
+         "Emoji_Component" | "EComp" => Self::EmojiComponent,
+         "Emoji_Modifier" | "EMod" => Self::EmojiModifier,
+         "Emoji_Modifier_Base" | "EBase" => Self::EmojiModifierBase,
+         "Emoji_Presentation" | "EPres" => Self::EmojiPresentation,
+         "Extended_Pictographic" | "ExtPict" => Self::ExtendedPictographic,
+         "Extender" | "Ext" => Self::Extender,
+         "Grapheme_Base" | "Gr_Base" => Self::GraphemeBase,
+         "Grapheme_Extend" | "Gr_Ext" => Self::GraphemeExtend,
+         "Hex_Digit" | "Hex" => Self::HexDigit,
+         "IDS_Binary_Operator" | "IDSB" => Self::IdsBinaryOperator,
+         "IDS_Trinary_Operator" | "IDST" => Self::IdsTrinaryOperator,
+         "ID_Continue" | "IDC" => Self::IdContinue,
+         "ID_Start" | "IDS" => Self::IdStart,
+         "Ideographic" | "Ideo" => Self::Ideographic,
+         "Join_Control" | "Join_C" => Self::JoinControl,
+         "Logical_Order_Exception" | "LOE" => Self::LogicalOrderException,
+         "Lowercase" | "Lower" => Self::Lowercase,
+         "Math" => Self::Math,
+         "Noncharacter_Code_Point" | "NChar" => Self::NoncharacterCodePoint,
+         "Pattern_Syntax" | "Pat_Syn" => Self::PatternSyntax,
+         "Pattern_White_Space" | "Pat_WS" => Self::PatternWhiteSpace,
+         "Quotation_Mark" | "QMark" => Self::QuotationMark,
+         "Radical" => Self::Radical,
+         "Regional_Indicator" | "RI" => Self::RegionalIndicator,
+         "Sentence_Terminal" | "STerm" => Self::SentenceTerminal,
+         "Soft_Dotted" | "SD" => Self::SoftDotted,
+         "Terminal_Punctuation" | "Term" => Self::TerminalPunctuation,
+         "Unified_Ideograph" | "UIdeo" => Self::UnifiedIdeograph,
+         "Uppercase" | "Upper" => Self::Uppercase,
+         "Variation_Selector" | "VS" => Self::VariationSelector,
+         "White_Space" | "space" => Self::WhiteSpace,
+         "XID_Continue" | "XIDC" => Self::XidContinue,
+         "XID_Start" | "XIDS" => Self::XidStart,
+         _ => return None,
+     })
+ }
+}
diff --git a/third_party/rust/icu_properties/src/script.rs b/third_party/rust/icu_properties/src/script.rs
new file mode 100644
index 0000000000..7e2595a4c4
--- /dev/null
+++ b/third_party/rust/icu_properties/src/script.rs
@@ -0,0 +1,648 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! Data and APIs for supporting both Script and Script_Extensions property
+//! values in an efficient structure.
+
+use crate::error::PropertiesError;
+use crate::props::Script;
+use crate::props::ScriptULE;
+use crate::provider::*;
+
+use core::iter::FromIterator;
+use core::ops::RangeInclusive;
+use icu_collections::codepointinvlist::CodePointInversionList;
+use icu_provider::prelude::*;
+use zerovec::{ule::AsULE, ZeroSlice};
+
+/// The number of bits at the low-end of a `ScriptWithExt` value used for
+/// storing the `Script` value (or `extensions` index).
+const SCRIPT_VAL_LENGTH: u16 = 10;
+
+/// The bit mask necessary to retrieve the `Script` value (or `extensions` index)
+/// from a `ScriptWithExt` value.
+const SCRIPT_X_SCRIPT_VAL: u16 = (1 << SCRIPT_VAL_LENGTH) - 1;
+
+/// An internal-use only pseudo-property that represents the values stored in
+/// the trie of the special data structure [`ScriptWithExtensionsPropertyV1`].
+///
+/// Note: This assumes a 12-bit layout. The 2 higher-order bits in positions
+/// 11..10 act as a selector that indicates how to deduce the Script value and
+/// Script_Extensions: selector `1` is what `is_common()` tests for, `2` is what
+/// `is_inherited()` tests for, `3` is what `is_other()` tests for, and a
+/// selector of `0` means the value has no Script_Extensions. The lower 10 bits
+/// 9..0 hold either the Script value or the index into the `extensions`
+/// structure.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+#[cfg_attr(feature = "datagen", derive(databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))]
+#[repr(transparent)]
+#[doc(hidden)]
+// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsPropertyV1` constructor
+#[allow(clippy::exhaustive_structs)] // this type is stable
+pub struct ScriptWithExt(pub u16);
+
+#[allow(missing_docs)] // These constants don't need individual documentation.
+#[allow(non_upper_case_globals)]
+#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsPropertyV1` constructor
+impl ScriptWithExt {
+ // Raw value 0: none of the selector bits are set, so `has_extensions()` is false for it.
+ pub const Unknown: ScriptWithExt = ScriptWithExt(0);
+}
+
+impl AsULE for ScriptWithExt {
+    type ULE = ScriptULE;
+
+    // Encoding is delegated to `Script`, which wraps the same raw `u16`.
+    #[inline]
+    fn to_unaligned(self) -> Self::ULE {
+        let ScriptWithExt(raw) = self;
+        Script::to_unaligned(Script(raw))
+    }
+
+    // Decoding goes through `Script` as well and re-wraps the raw `u16`.
+    #[inline]
+    fn from_unaligned(unaligned: Self::ULE) -> Self {
+        let Script(raw) = Script::from_unaligned(unaligned);
+        Self(raw)
+    }
+}
+
+#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsPropertyV1` constructor
+impl ScriptWithExt {
+ /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
+ /// also indicates a Script value of [`Script::Common`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::script::ScriptWithExt;
+ ///
+ /// assert!(ScriptWithExt(0x04FF).is_common());
+ /// assert!(ScriptWithExt(0x0400).is_common());
+ ///
+ /// assert!(!ScriptWithExt(0x08FF).is_common());
+ /// assert!(!ScriptWithExt(0x0800).is_common());
+ ///
+ /// assert!(!ScriptWithExt(0x0CFF).is_common());
+ /// assert!(!ScriptWithExt(0x0C00).is_common());
+ ///
+ /// assert!(!ScriptWithExt(0xFF).is_common());
+ /// assert!(!ScriptWithExt(0x0).is_common());
+ /// ```
+ pub fn is_common(&self) -> bool {
+     // Everything above the low `SCRIPT_VAL_LENGTH` bits is the selector.
+     let selector = self.0 >> SCRIPT_VAL_LENGTH;
+     selector == 1
+ }
+
+ /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
+ /// also indicates a Script value of [`Script::Inherited`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::script::ScriptWithExt;
+ ///
+ /// assert!(!ScriptWithExt(0x04FF).is_inherited());
+ /// assert!(!ScriptWithExt(0x0400).is_inherited());
+ ///
+ /// assert!(ScriptWithExt(0x08FF).is_inherited());
+ /// assert!(ScriptWithExt(0x0800).is_inherited());
+ ///
+ /// assert!(!ScriptWithExt(0x0CFF).is_inherited());
+ /// assert!(!ScriptWithExt(0x0C00).is_inherited());
+ ///
+ /// assert!(!ScriptWithExt(0xFF).is_inherited());
+ /// assert!(!ScriptWithExt(0x0).is_inherited());
+ /// ```
+ pub fn is_inherited(&self) -> bool {
+     // Everything above the low `SCRIPT_VAL_LENGTH` bits is the selector.
+     let selector = self.0 >> SCRIPT_VAL_LENGTH;
+     selector == 2
+ }
+
+ /// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
+ /// also indicates that the Script value is neither [`Script::Common`] nor
+ /// [`Script::Inherited`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::script::ScriptWithExt;
+ ///
+ /// assert!(!ScriptWithExt(0x04FF).is_other());
+ /// assert!(!ScriptWithExt(0x0400).is_other());
+ ///
+ /// assert!(!ScriptWithExt(0x08FF).is_other());
+ /// assert!(!ScriptWithExt(0x0800).is_other());
+ ///
+ /// assert!(ScriptWithExt(0x0CFF).is_other());
+ /// assert!(ScriptWithExt(0x0C00).is_other());
+ ///
+ /// assert!(!ScriptWithExt(0xFF).is_other());
+ /// assert!(!ScriptWithExt(0x0).is_other());
+ /// ```
+ pub fn is_other(&self) -> bool {
+     // Everything above the low `SCRIPT_VAL_LENGTH` bits is the selector.
+     let selector = self.0 >> SCRIPT_VAL_LENGTH;
+     selector == 3
+ }
+
+ /// Returns whether the [`ScriptWithExt`] value has Script_Extensions.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::script::ScriptWithExt;
+ ///
+ /// assert!(ScriptWithExt(0x04FF).has_extensions());
+ /// assert!(ScriptWithExt(0x0400).has_extensions());
+ ///
+ /// assert!(ScriptWithExt(0x08FF).has_extensions());
+ /// assert!(ScriptWithExt(0x0800).has_extensions());
+ ///
+ /// assert!(ScriptWithExt(0x0CFF).has_extensions());
+ /// assert!(ScriptWithExt(0x0C00).has_extensions());
+ ///
+ /// assert!(!ScriptWithExt(0xFF).has_extensions());
+ /// assert!(!ScriptWithExt(0x0).has_extensions());
+ /// ```
+ pub fn has_extensions(&self) -> bool {
+     // Any non-zero selector means Script_Extensions data is attached.
+     (self.0 >> SCRIPT_VAL_LENGTH) != 0
+ }
+}
+
+impl From<ScriptWithExt> for u32 {
+    // Lossless widening of the raw 16-bit trie value.
+    fn from(swe: ScriptWithExt) -> Self {
+        let ScriptWithExt(raw) = swe;
+        u32::from(raw)
+    }
+}
+
+impl From<ScriptWithExt> for Script {
+    // Re-wraps the raw `u16` unchanged; selector bits, if any, are not stripped.
+    fn from(swe: ScriptWithExt) -> Self {
+        let ScriptWithExt(raw) = swe;
+        Self(raw)
+    }
+}
+
+/// A struct that wraps a [`Script`] array, such as in the return value for
+/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val).
+///
+/// The set only borrows its elements; membership tests and iteration are
+/// offered by `contains()` and `iter()`.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub struct ScriptExtensionsSet<'a> {
+ // Borrowed script codes. `contains()` binary-searches this slice, so it is
+ // presumably stored in ascending order -- TODO confirm against the data source.
+ values: &'a ZeroSlice<Script>,
+}
+
+impl ScriptExtensionsSet<'_> {
+    /// Reports whether the script `x` is a member of this set.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use icu::properties::{script, Script};
+    /// let swe = script::script_with_extensions();
+    ///
+    /// assert!(swe
+    ///     .get_script_extensions_val(0x11303) // GRANTHA SIGN VISARGA
+    ///     .contains(&Script::Grantha));
+    /// ```
+    pub fn contains(&self, x: &Script) -> bool {
+        self.values.binary_search(x).is_ok()
+    }
+
+    /// Produces an iterator that yields each [`Script`] in the set.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use icu::properties::{script, Script};
+    /// let swe = script::script_with_extensions();
+    ///
+    /// assert_eq!(
+    ///     swe.get_script_extensions_val('௫' as u32) // U+0BEB TAMIL DIGIT FIVE
+    ///         .iter()
+    ///         .collect::<Vec<Script>>(),
+    ///     vec![Script::Tamil, Script::Grantha]
+    /// );
+    /// ```
+    pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + '_ {
+        self.values.iter()
+    }
+
+    /// Number of scripts in the set, for array-style access.
+    ///
+    /// Only needed for the FFI bindings; shouldn't be used directly from Rust.
+    #[doc(hidden)]
+    pub fn array_len(&self) -> usize {
+        ZeroSlice::len(self.values)
+    }
+    /// The script at position `index`, or `None` when `index` is out of range.
+    ///
+    /// Only needed for the FFI bindings; shouldn't be used directly from Rust.
+    #[doc(hidden)]
+    pub fn array_get(&self, index: usize) -> Option<Script> {
+        ZeroSlice::get(self.values, index)
+    }
+}
+
+/// A wrapper around script extensions data. Can be obtained via [`load_script_with_extensions_unstable()`] and
+/// related getters.
+///
+/// Most useful methods are on [`ScriptWithExtensionsBorrowed`], obtained by calling [`ScriptWithExtensions::as_borrowed()`].
+#[derive(Debug)]
+pub struct ScriptWithExtensions {
+ // Data payload for the combined Script / Script_Extensions structure;
+ // `as_borrowed()` hands out a reference obtained from it.
+ data: DataPayload<ScriptWithExtensionsPropertyV1Marker>,
+}
+
+/// A borrowed wrapper around script extension data, returned by
+/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query.
+///
+/// The type is `Copy`, which is why its query methods take `self` by value.
+#[derive(Clone, Copy, Debug)]
+pub struct ScriptWithExtensionsBorrowed<'a> {
+ // Reference to the data struct whose `trie` and `extensions` fields the
+ // query methods read.
+ data: &'a ScriptWithExtensionsPropertyV1<'a>,
+}
+
+impl ScriptWithExtensions {
+    /// Creates the borrowed, queryable view of this data.
+    ///
+    /// Resolving the payload once here avoids paying a potential small cost on
+    /// every query (ex: `has_script()`).
+    #[inline]
+    pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> {
+        let data = self.data.get();
+        ScriptWithExtensionsBorrowed { data }
+    }
+
+    /// Wraps an already-loaded data payload.
+    ///
+    /// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead
+    pub fn from_data(data: DataPayload<ScriptWithExtensionsPropertyV1Marker>) -> Self {
+        ScriptWithExtensions { data }
+    }
+}
+
+impl<'a> ScriptWithExtensionsBorrowed<'a> {
+ /// Returns the `Script` property value for this code point.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::{script, Script};
+ ///
+ /// let swe = script::script_with_extensions();
+ ///
+ /// // U+0640 ARABIC TATWEEL
+ /// assert_eq!(swe.get_script_val(0x0640), Script::Common); // main Script value
+ /// assert_ne!(swe.get_script_val(0x0640), Script::Arabic);
+ /// assert_ne!(swe.get_script_val(0x0640), Script::Syriac);
+ /// assert_ne!(swe.get_script_val(0x0640), Script::Thaana);
+ ///
+ /// // U+0650 ARABIC KASRA
+ /// assert_eq!(swe.get_script_val(0x0650), Script::Inherited); // main Script value
+ /// assert_ne!(swe.get_script_val(0x0650), Script::Arabic);
+ /// assert_ne!(swe.get_script_val(0x0650), Script::Syriac);
+ /// assert_ne!(swe.get_script_val(0x0650), Script::Thaana);
+ ///
+ /// // U+0660 ARABIC-INDIC DIGIT ZERO
+ /// assert_ne!(swe.get_script_val(0x0660), Script::Common);
+ /// assert_eq!(swe.get_script_val(0x0660), Script::Arabic); // main Script value
+ /// assert_ne!(swe.get_script_val(0x0660), Script::Syriac);
+ /// assert_ne!(swe.get_script_val(0x0660), Script::Thaana);
+ ///
+ /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
+ /// assert_ne!(swe.get_script_val(0xFDF2), Script::Common);
+ /// assert_eq!(swe.get_script_val(0xFDF2), Script::Arabic); // main Script value
+ /// assert_ne!(swe.get_script_val(0xFDF2), Script::Syriac);
+ /// assert_ne!(swe.get_script_val(0xFDF2), Script::Thaana);
+ /// ```
+ pub fn get_script_val(self, code_point: u32) -> Script {
+     let trie_val = self.data.trie.get32(code_point);
+
+     // Common / Inherited: the selector bits alone determine the Script value.
+     if trie_val.is_common() {
+         return Script::Common;
+     }
+     if trie_val.is_inherited() {
+         return Script::Inherited;
+     }
+     // Not "other" either: the stored value is used as the Script code as-is.
+     if !trie_val.is_other() {
+         return Script(trie_val.0);
+     }
+
+     // "Other": the low bits index into `extensions`, and the first element of
+     // that entry is the main Script value. A missing entry or element falls
+     // back to `Script::Unknown`.
+     let ext_idx = usize::from(trie_val.0 & SCRIPT_X_SCRIPT_VAL);
+     self.data
+         .extensions
+         .get(ext_idx)
+         .and_then(|scx| scx.get(0))
+         .unwrap_or(Script::Unknown)
+ }
+ // Returns the Script_Extensions value for a code point when the trie value
+ // is already known.
+ //
+ // This private helper exists to prevent code duplication in its callers:
+ // `get_script_extensions_val`, `get_script_extensions_ranges` (and, through
+ // it, `get_script_extensions_set`), and `has_script`.
+ //
+ // The trie value is taken by reference because, when it carries no
+ // Script_Extensions, the returned slice is a one-element view of that very value.
+ fn get_scx_val_using_trie_val(
+ self,
+ sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE,
+ ) -> &'a ZeroSlice<Script> {
+ let sc_with_ext = ScriptWithExt::from_unaligned(*sc_with_ext_ule);
+ if sc_with_ext.is_other() {
+ // The low bits are an index into `extensions` here, not a Script code.
+ let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
+ let ext_subarray = self.data.extensions.get(ext_idx as usize);
+ // In the OTHER case, where the 2 higher-order bits of the
+ // `ScriptWithExt` value in the trie doesn't indicate the Script value,
+ // the Script value is copied/inserted into the first position of the
+ // `extensions` array. So we must remove it to return the actual scx array val.
+ // A missing entry, or one with no elements at all, yields an empty slice.
+ let scx_slice = ext_subarray
+ .and_then(|zslice| zslice.as_ule_slice().get(1..))
+ .unwrap_or_default();
+ ZeroSlice::from_ule_slice(scx_slice)
+ } else if sc_with_ext.is_common() || sc_with_ext.is_inherited() {
+ // Common / Inherited: the Script value is implied by the selector bits,
+ // so the indexed `extensions` entry is returned whole.
+ let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
+ let scx_val = self.data.extensions.get(ext_idx as usize);
+ // A missing entry yields an empty slice.
+ scx_val.unwrap_or_default()
+ } else {
+ // Note: `Script` and `ScriptWithExt` are both represented as the same
+ // u16 value when the `ScriptWithExt` has no higher-order bits set.
+ let script_ule_slice = core::slice::from_ref(sc_with_ext_ule);
+ ZeroSlice::from_ule_slice(script_ule_slice)
+ }
+ }
+ /// Return the `Script_Extensions` property value for this code point.
+ ///
+ /// If `code_point` has Script_Extensions, then return the Script codes in
+ /// the Script_Extensions. In this case, the Script property value
+ /// (normally Common or Inherited) is not included in the [`ScriptExtensionsSet`].
+ ///
+ /// If `code_point` does not have Script_Extensions, then the returned
+ /// [`ScriptExtensionsSet`] contains only its one Script code.
+ ///
+ /// If `code_point` is not a valid code point, then return an empty [`ScriptExtensionsSet`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::{script, Script};
+ ///
+ /// let swe = script::script_with_extensions();
+ ///
+ /// assert_eq!(
+ /// swe.get_script_extensions_val('𐓐' as u32) // U+104D0 OSAGE CAPITAL LETTER KHA
+ /// .iter()
+ /// .collect::<Vec<Script>>(),
+ /// vec![Script::Osage]
+ /// );
+ /// assert_eq!(
+ /// swe.get_script_extensions_val('🥳' as u32) // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
+ /// .iter()
+ /// .collect::<Vec<Script>>(),
+ /// vec![Script::Common]
+ /// );
+ /// assert_eq!(
+ /// swe.get_script_extensions_val(0x200D) // ZERO WIDTH JOINER
+ /// .iter()
+ /// .collect::<Vec<Script>>(),
+ /// vec![Script::Inherited]
+ /// );
+ /// assert_eq!(
+ /// swe.get_script_extensions_val('௫' as u32) // U+0BEB TAMIL DIGIT FIVE
+ /// .iter()
+ /// .collect::<Vec<Script>>(),
+ /// vec![Script::Tamil, Script::Grantha]
+ /// );
+ /// ```
+ pub fn get_script_extensions_val(self, code_point: u32) -> ScriptExtensionsSet<'a> {
+     // No trie value for this code point -> empty set; otherwise decode the value.
+     let values: &'a ZeroSlice<Script> = match self.data.trie.get32_ule(code_point) {
+         Some(trie_val_ule) => self.get_scx_val_using_trie_val(trie_val_ule),
+         None => ZeroSlice::from_ule_slice(&[]),
+     };
+
+     ScriptExtensionsSet { values }
+ }
+
+ /// Returns whether `script` is contained in the Script_Extensions
+ /// property value if the code point has Script_Extensions; otherwise,
+ /// if the code point does not have Script_Extensions, returns
+ /// whether the Script property value matches.
+ ///
+ /// Returns `false` when the trie lookup yields no value for `code_point`.
+ ///
+ /// Some characters are commonly used in multiple scripts. For more information,
+ /// see UAX #24: <http://www.unicode.org/reports/tr24/>.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::{script, Script};
+ ///
+ /// let swe = script::script_with_extensions();
+ ///
+ /// // U+0650 ARABIC KASRA
+ /// assert!(!swe.has_script(0x0650, Script::Inherited)); // main Script value
+ /// assert!(swe.has_script(0x0650, Script::Arabic));
+ /// assert!(swe.has_script(0x0650, Script::Syriac));
+ /// assert!(!swe.has_script(0x0650, Script::Thaana));
+ ///
+ /// // U+0660 ARABIC-INDIC DIGIT ZERO
+ /// assert!(!swe.has_script(0x0660, Script::Common)); // main Script value
+ /// assert!(swe.has_script(0x0660, Script::Arabic));
+ /// assert!(!swe.has_script(0x0660, Script::Syriac));
+ /// assert!(swe.has_script(0x0660, Script::Thaana));
+ ///
+ /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
+ /// assert!(!swe.has_script(0xFDF2, Script::Common));
+ /// assert!(swe.has_script(0xFDF2, Script::Arabic)); // main Script value
+ /// assert!(!swe.has_script(0xFDF2, Script::Syriac));
+ /// assert!(swe.has_script(0xFDF2, Script::Thaana));
+ /// ```
+ pub fn has_script(self, code_point: u32, script: Script) -> bool {
+     // Without a trie value there is nothing to compare against.
+     let sc_with_ext_ule = match self.data.trie.get32_ule(code_point) {
+         Some(scwe_ule) => scwe_ule,
+         None => return false,
+     };
+     let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule);
+
+     if sc_with_ext.has_extensions() {
+         // `any` states the membership test directly (instead of `find` followed
+         // by `is_some`) and matches `get_script_extensions_ranges`.
+         self.get_scx_val_using_trie_val(sc_with_ext_ule)
+             .iter()
+             .any(|sc| sc == script)
+     } else {
+         // No selector bits set: the trie value is the Script code itself.
+         script == Script(sc_with_ext.0)
+     }
+ }
+
+ /// Returns all of the matching `CodePointMapRange`s for the given [`Script`]
+ /// in which `has_script` will return true for all of the contained code points.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::{script, Script};
+ ///
+ /// let swe = script::script_with_extensions();
+ ///
+ /// let syriac_script_extensions_ranges = swe.get_script_extensions_ranges(Script::Syriac);
+ ///
+ /// let exp_ranges = vec![
+ /// 0x060C..=0x060C, // ARABIC COMMA
+ /// 0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK
+ /// 0x061F..=0x061F, // ARABIC QUESTION MARK
+ /// 0x0640..=0x0640, // ARABIC TATWEEL
+ /// 0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW
+ /// 0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF
+ /// 0x0700..=0x070D, // Syriac block begins at U+0700
+ /// 0x070F..=0x074A, // Syriac block
+ /// 0x074D..=0x074F, // Syriac block ends at U+074F
+ /// 0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F
+ /// 0x1DF8..=0x1DF8, // U+1DF8 COMBINING DOT ABOVE LEFT
+ /// 0x1DFA..=0x1DFA, // U+1DFA COMBINING DOT BELOW LEFT
+ /// ];
+ /// let mut exp_ranges_iter = exp_ranges.iter();
+ ///
+ /// for act_range in syriac_script_extensions_ranges {
+ /// let exp_range = exp_ranges_iter
+ /// .next()
+ /// .expect("There are too many ranges returned by get_script_extensions_ranges()");
+ /// assert_eq!(act_range.start(), exp_range.start());
+ /// assert_eq!(act_range.end(), exp_range.end());
+ /// }
+ /// assert!(
+ /// exp_ranges_iter.next().is_none(),
+ /// "There are too few ranges returned by get_script_extensions_ranges()"
+ /// );
+ /// ```
+ pub fn get_script_extensions_ranges(
+ self,
+ script: Script,
+ ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
+ self.data
+ .trie
+ // Map each trie value to a boolean: "does this value's script set include
+ // `script`?". The test mirrors the one performed by `has_script`.
+ // NOTE(review): `iter_ranges_mapped` presumably groups consecutive code
+ // points with the same mapped value into one range -- confirm in the trie docs.
+ .iter_ranges_mapped(move |value| {
+ let sc_with_ext = ScriptWithExt(value.0);
+ if sc_with_ext.has_extensions() {
+ // The helper takes the unaligned form by reference, so a temporary
+ // is created and borrowed just for this call.
+ self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned())
+ .iter()
+ .any(|sc| sc == script)
+ } else {
+ // No selector bits set: the raw value converts directly to a `Script`.
+ script == sc_with_ext.into()
+ }
+ })
+ // Keep only the ranges whose mapped value is `true` ...
+ .filter(|v| v.value)
+ // ... and hand back just the code point range.
+ .map(|v| v.range)
+ }
+
+ /// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all
+ /// code points for which `has_script` will return true.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::properties::{script, Script};
+ ///
+ /// let swe = script::script_with_extensions();
+ ///
+ /// let syriac = swe.get_script_extensions_set(Script::Syriac);
+ ///
+ /// assert!(!syriac.contains32(0x061E)); // ARABIC TRIPLE DOT PUNCTUATION MARK
+ /// assert!(syriac.contains32(0x061F)); // ARABIC QUESTION MARK
+ /// assert!(!syriac.contains32(0x0620)); // ARABIC LETTER KASHMIRI YEH
+ ///
+ /// assert!(syriac.contains32(0x0700)); // SYRIAC END OF PARAGRAPH
+ /// assert!(syriac.contains32(0x074A)); // SYRIAC BARREKH
+ /// assert!(!syriac.contains32(0x074B)); // unassigned
+ /// assert!(syriac.contains32(0x074F)); // SYRIAC LETTER SOGDIAN FE
+ /// assert!(!syriac.contains32(0x0750)); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW
+ ///
+ /// assert!(syriac.contains32(0x1DF8)); // COMBINING DOT ABOVE LEFT
+ /// assert!(!syriac.contains32(0x1DF9)); // COMBINING WIDE INVERTED BRIDGE BELOW
+ /// assert!(syriac.contains32(0x1DFA)); // COMBINING DOT BELOW LEFT
+ /// assert!(!syriac.contains32(0x1DFB)); // COMBINING DELETION MARK
+ /// ```
+ pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> {
+     // Collect the matching ranges into an inversion list.
+     let matching_ranges = self.get_script_extensions_ranges(script);
+     CodePointInversionList::from_iter(matching_ranges)
+ }
+}
+
+impl ScriptWithExtensionsBorrowed<'static> {
+    /// Turns a `ScriptWithExtensionsBorrowed<'static>` into a `ScriptWithExtensions`
+    /// by wrapping the same static reference in a payload. This is cheap.
+    pub const fn static_to_owned(self) -> ScriptWithExtensions {
+        let static_ref = self.data;
+        ScriptWithExtensions {
+            data: DataPayload::from_static_ref(static_ref),
+        }
+    }
+}
+
+/// Returns a [`ScriptWithExtensionsBorrowed`] struct that represents the data for the Script
+/// and Script_Extensions properties.
+///
+/// ✨ *Enabled with the `compiled_data` Cargo feature.*
+///
+/// [📚 Help choosing a constructor](icu_provider::constructors)
+///
+/// # Examples
+///
+/// ```
+/// use icu::properties::{script, Script};
+/// let swe = script::script_with_extensions();
+///
+/// // get the `Script` property value
+/// assert_eq!(swe.get_script_val(0x0640), Script::Common); // U+0640 ARABIC TATWEEL
+/// assert_eq!(swe.get_script_val(0x0650), Script::Inherited); // U+0650 ARABIC KASRA
+/// assert_eq!(swe.get_script_val(0x0660), Script::Arabic); // U+0660 ARABIC-INDIC DIGIT ZERO
+/// assert_eq!(swe.get_script_val(0xFDF2), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
+///
+/// // get the `Script_Extensions` property value
+/// assert_eq!(
+/// swe.get_script_extensions_val(0x0640) // U+0640 ARABIC TATWEEL
+/// .iter().collect::<Vec<Script>>(),
+/// vec![Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean,
+/// Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian,
+/// Script::OldUyghur]
+/// );
+/// assert_eq!(
+/// swe.get_script_extensions_val('🥳' as u32) // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
+/// .iter().collect::<Vec<Script>>(),
+/// vec![Script::Common]
+/// );
+/// assert_eq!(
+/// swe.get_script_extensions_val(0x200D) // ZERO WIDTH JOINER
+/// .iter().collect::<Vec<Script>>(),
+/// vec![Script::Inherited]
+/// );
+/// assert_eq!(
+/// swe.get_script_extensions_val('௫' as u32) // U+0BEB TAMIL DIGIT FIVE
+/// .iter().collect::<Vec<Script>>(),
+/// vec![Script::Tamil, Script::Grantha]
+/// );
+///
+/// // check containment of a `Script` value in the `Script_Extensions` value
+/// // U+0650 ARABIC KASRA
+/// assert!(!swe.has_script(0x0650, Script::Inherited)); // main Script value
+/// assert!(swe.has_script(0x0650, Script::Arabic));
+/// assert!(swe.has_script(0x0650, Script::Syriac));
+/// assert!(!swe.has_script(0x0650, Script::Thaana));
+///
+/// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value
+/// let syriac = swe.get_script_extensions_set(Script::Syriac);
+/// assert!(syriac.contains32(0x0650)); // ARABIC KASRA
+/// assert!(!syriac.contains32(0x0660)); // ARABIC-INDIC DIGIT ZERO
+/// assert!(!syriac.contains32(0xFDF2)); // ARABIC LIGATURE ALLAH ISOLATED FORM
+/// assert!(syriac.contains32(0x0700)); // SYRIAC END OF PARAGRAPH
+/// assert!(syriac.contains32(0x074A)); // SYRIAC BARREKH
+/// ```
+#[cfg(feature = "compiled_data")]
+pub const fn script_with_extensions() -> ScriptWithExtensionsBorrowed<'static> {
+    // Borrow the baked singleton directly.
+    let data = crate::provider::Baked::SINGLETON_PROPS_SCX_V1;
+    ScriptWithExtensionsBorrowed { data }
+}
+
+// Macro-generated constructor variants that load `ScriptWithExtensions` from a
+// data provider. No locale or options are involved (`skip`).
+// NOTE(review): presumably this emits the `_with_any_provider` and
+// `_with_buffer_provider` functions named below as wrappers around
+// `load_script_with_extensions_unstable`, and `#[cfg(skip)]` suppresses the
+// generated compiled-data constructor because `script_with_extensions` is
+// written by hand -- confirm against the `icu_provider` macro documentation.
+icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ options: skip,
+ result: Result<ScriptWithExtensions, PropertiesError>,
+ #[cfg(skip)]
+ functions: [
+ script_with_extensions,
+ load_script_with_extensions_with_any_provider,
+ load_script_with_extensions_with_buffer_provider,
+ load_script_with_extensions_unstable,
+ ]
+);
+
+#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, script_with_extensions)]
+pub fn load_script_with_extensions_unstable(
+    provider: &(impl DataProvider<ScriptWithExtensionsPropertyV1Marker> + ?Sized),
+) -> Result<ScriptWithExtensions, PropertiesError> {
+    // Load with the default request and extract the payload; any failure is
+    // propagated to the caller through `?`.
+    let payload = provider
+        .load(Default::default())
+        .and_then(DataResponse::take_payload)?;
+
+    Ok(ScriptWithExtensions::from_data(payload))
+}
diff --git a/third_party/rust/icu_properties/src/sets.rs b/third_party/rust/icu_properties/src/sets.rs
new file mode 100644
index 0000000000..3fd229f72c
--- /dev/null
+++ b/third_party/rust/icu_properties/src/sets.rs
@@ -0,0 +1,2381 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! The functions in this module return a [`CodePointSetData`] containing
+//! the set of characters with a particular Unicode property.
+//!
+//! The descriptions of most properties are taken from [`TR44`], the documentation for the
+//! Unicode Character Database. Some properties are instead defined in [`TR18`], the
+//! documentation for Unicode regular expressions. In particular, Annex C of this document
+//! defines properties for POSIX compatibility.
+//!
+//! [`CodePointSetData`]: crate::sets::CodePointSetData
+//! [`TR44`]: https://www.unicode.org/reports/tr44
+//! [`TR18`]: https://www.unicode.org/reports/tr18
+
+use crate::error::PropertiesError;
+use crate::provider::*;
+use crate::*;
+use core::iter::FromIterator;
+use core::ops::RangeInclusive;
+use icu_collections::codepointinvlist::CodePointInversionList;
+use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
+use icu_provider::prelude::*;
+
+//
+// CodePointSet* structs, impls, & macros
+// (a set with only code points)
+//
+
+/// A wrapper around code point set data. It is returned by APIs that return Unicode
+/// property data in a set-like form, ex: a set of code points sharing the same
+/// value for a Unicode property. Access its data via the borrowed version,
+/// [`CodePointSetDataBorrowed`].
+#[derive(Debug)]
+pub struct CodePointSetData {
+ data: DataPayload<ErasedSetlikeMarker>,
+}
+
+/// Private marker type for CodePointSetData
+/// to work for all set properties at once
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct ErasedSetlikeMarker;
+impl DataMarker for ErasedSetlikeMarker {
+ type Yokeable = PropertyCodePointSetV1<'static>;
+}
+
+impl CodePointSetData {
+ /// Construct a borrowed version of this type that can be queried.
+ ///
+ /// This owned version is returned by functions that use a runtime data provider.
+ #[inline]
+ pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
+ CodePointSetDataBorrowed {
+ set: self.data.get(),
+ }
+ }
+
+ /// Construct a new one from loaded data
+ ///
+ /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead
+ pub fn from_data<M>(data: DataPayload<M>) -> Self
+ where
+ M: DataMarker<Yokeable = PropertyCodePointSetV1<'static>>,
+ {
+ Self { data: data.cast() }
+ }
+
+ /// Construct a new [`CodePointSetData`] from an owned [`CodePointInversionList`]
+ pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
+ let set = PropertyCodePointSetV1::from_code_point_inversion_list(set);
+ CodePointSetData::from_data(DataPayload::<ErasedSetlikeMarker>::from_owned(set))
+ }
+
+ /// Convert this type to a [`CodePointInversionList`] as a borrowed value.
+ ///
+ /// The data backing this is extensible and supports multiple implementations.
+ /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
+ /// added, and users may select which at data generation time.
+ ///
+ /// This method returns an `Option` in order to return `None` when the backing data provider
+ /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
+ /// constraint.
+ pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
+ self.data.get().as_code_point_inversion_list()
+ }
+
+ /// Convert this type to a [`CodePointInversionList`], borrowing if possible,
+ /// otherwise allocating a new [`CodePointInversionList`].
+ ///
+ /// The data backing this is extensible and supports multiple implementations.
+ /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
+ /// added, and users may select which at data generation time.
+ ///
+ /// The performance of the conversion to this specific return type will vary
+ /// depending on the data structure that is backing `self`.
+ pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
+ self.data.get().to_code_point_inversion_list()
+ }
+}
+
+/// A borrowed wrapper around code point set data, returned by
+/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
+#[derive(Clone, Copy, Debug)]
+pub struct CodePointSetDataBorrowed<'a> {
+ set: &'a PropertyCodePointSetV1<'a>,
+}
+
+impl CodePointSetDataBorrowed<'static> {
+ /// Cheaply converts a `CodePointSetDataBorrowed<'static>` into a `CodePointSetData`.
+ pub const fn static_to_owned(self) -> CodePointSetData {
+ CodePointSetData {
+ data: DataPayload::from_static_ref(self.set),
+ }
+ }
+}
+
+impl<'a> CodePointSetDataBorrowed<'a> {
+ /// Check if the set contains a character
+ ///
+ /// ```rust
+ /// use icu_properties::sets;
+ ///
+ /// let alphabetic = sets::alphabetic();
+ ///
+ /// assert!(!alphabetic.contains('3'));
+ /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
+ /// assert!(alphabetic.contains('A'));
+ /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
+ /// ```
+ #[inline]
+ pub fn contains(self, ch: char) -> bool {
+ self.set.contains(ch)
+ }
+
+ /// Check if the set contains a character as a UTF32 code unit
+ ///
+ /// ```rust
+ /// use icu_properties::sets;
+ ///
+ /// let alphabetic = sets::alphabetic();
+ ///
+ /// assert!(!alphabetic.contains32(0x0A69)); // U+0A69 GURMUKHI DIGIT THREE
+ /// assert!(alphabetic.contains32(0x00C4)); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
+ /// ```
+ #[inline]
+ pub fn contains32(self, ch: u32) -> bool {
+ self.set.contains32(ch)
+ }
+
+ /// Yields an [`Iterator`] returning the ranges of the code points that are
+ /// included in the [`CodePointSetData`]
+ ///
+ /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
+ /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
+ /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let alphabetic = sets::alphabetic();
+ /// let mut ranges = alphabetic.iter_ranges();
+ ///
+ /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
+ /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
+ /// ```
+ #[inline]
+ pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
+ self.set.iter_ranges()
+ }
+
+ /// Yields an [`Iterator`] returning the ranges of the code points that are
+ /// *not* included in the [`CodePointSetData`]
+ ///
+ /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
+ /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
+ /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let alphabetic = sets::alphabetic();
+ /// let mut ranges = alphabetic.iter_ranges_complemented();
+ ///
+ /// assert_eq!(Some(0x0000..=0x0040), ranges.next()); // everything before 'A'
+ /// assert_eq!(Some(0x005B..=0x0060), ranges.next()); // between 'Z' and 'a'
+ /// ```
+ #[inline]
+ pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
+ self.set.iter_ranges_complemented()
+ }
+}
+
+//
+// UnicodeSet* structs, impls, & macros
+// (a set with code points + strings)
+//
+
+/// A wrapper around `UnicodeSet` data (characters and strings)
+#[derive(Debug)]
+pub struct UnicodeSetData {
+ data: DataPayload<ErasedUnicodeSetlikeMarker>,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct ErasedUnicodeSetlikeMarker;
+impl DataMarker for ErasedUnicodeSetlikeMarker {
+ type Yokeable = PropertyUnicodeSetV1<'static>;
+}
+
+impl UnicodeSetData {
+ /// Construct a borrowed version of this type that can be queried.
+ ///
+ /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
+ /// up front.
+ #[inline]
+ pub fn as_borrowed(&self) -> UnicodeSetDataBorrowed<'_> {
+ UnicodeSetDataBorrowed {
+ set: self.data.get(),
+ }
+ }
+
+ /// Construct a new one from loaded data
+ ///
+ /// Typically it is preferable to use getters instead
+ pub fn from_data<M>(data: DataPayload<M>) -> Self
+ where
+ M: DataMarker<Yokeable = PropertyUnicodeSetV1<'static>>,
+ {
+ Self { data: data.cast() }
+ }
+
+ /// Construct a new [`UnicodeSetData`] from an owned [`CodePointInversionListAndStringList`]
+ pub fn from_code_point_inversion_list_string_list(
+ set: CodePointInversionListAndStringList<'static>,
+ ) -> Self {
+ let set = PropertyUnicodeSetV1::from_code_point_inversion_list_string_list(set);
+ UnicodeSetData::from_data(DataPayload::<ErasedUnicodeSetlikeMarker>::from_owned(set))
+ }
+
+ /// Convert this type to a [`CodePointInversionListAndStringList`] as a borrowed value.
+ ///
+ /// The data backing this is extensible and supports multiple implementations.
+ /// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be
+ /// added, and users may select which at data generation time.
+ ///
+ /// This method returns an `Option` in order to return `None` when the backing data provider
+ /// cannot return a [`CodePointInversionListAndStringList`], or cannot do so within the expected constant time
+ /// constraint.
+ pub fn as_code_point_inversion_list_string_list(
+ &self,
+ ) -> Option<&CodePointInversionListAndStringList<'_>> {
+ self.data.get().as_code_point_inversion_list_string_list()
+ }
+
+ /// Convert this type to a [`CodePointInversionListAndStringList`], borrowing if possible,
+ /// otherwise allocating a new [`CodePointInversionListAndStringList`].
+ ///
+ /// The data backing this is extensible and supports multiple implementations.
+ /// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be
+ /// added, and users may select which at data generation time.
+ ///
+ /// The performance of the conversion to this specific return type will vary
+ /// depending on the data structure that is backing `self`.
+ pub fn to_code_point_inversion_list_string_list(
+ &self,
+ ) -> CodePointInversionListAndStringList<'_> {
+ self.data.get().to_code_point_inversion_list_string_list()
+ }
+}
+
+/// A borrowed wrapper around Unicode set data (characters and strings), returned by
+/// [`UnicodeSetData::as_borrowed()`]. More efficient to query.
+#[derive(Clone, Copy, Debug)]
+pub struct UnicodeSetDataBorrowed<'a> {
+ set: &'a PropertyUnicodeSetV1<'a>,
+}
+
+impl<'a> UnicodeSetDataBorrowed<'a> {
+ /// Check if the set contains the string. Strings consisting of one character
+ /// are treated as a character/code point.
+ ///
+ /// This matches ICU behavior for ICU's `UnicodeSet`.
+ #[inline]
+ pub fn contains(self, s: &str) -> bool {
+ self.set.contains(s)
+ }
+
+ /// Check if the set contains a character as a UTF32 code unit
+ #[inline]
+ pub fn contains32(&self, cp: u32) -> bool {
+ self.set.contains32(cp)
+ }
+
+ /// Check if the set contains the code point corresponding to the Rust character.
+ #[inline]
+ pub fn contains_char(&self, ch: char) -> bool {
+ self.set.contains_char(ch)
+ }
+}
+
+impl UnicodeSetDataBorrowed<'static> {
+ /// Cheaply converts a `UnicodeSetDataBorrowed<'static>` into a `UnicodeSetData`.
+ pub const fn static_to_owned(self) -> UnicodeSetData {
+ UnicodeSetData {
+ data: DataPayload::from_static_ref(self.set),
+ }
+ }
+}
+
+pub(crate) fn load_set_data<M, P>(provider: &P) -> Result<CodePointSetData, PropertiesError>
+where
+ M: KeyedDataMarker<Yokeable = PropertyCodePointSetV1<'static>>,
+ P: DataProvider<M> + ?Sized,
+{
+ Ok(provider
+ .load(Default::default())
+ .and_then(DataResponse::take_payload)
+ .map(CodePointSetData::from_data)?)
+}
+
+//
+// Binary property getter fns
+// (data as code point sets)
+//
+
+macro_rules! make_code_point_set_property {
+ (
+ // currently unused
+ property: $property:expr;
+ // currently unused
+ marker: $marker_name:ident;
+ keyed_data_marker: $keyed_data_marker:ty;
+ func:
+ $(#[$doc:meta])+
+ $cvis:vis const fn $constname:ident() => $singleton_name:ident;
+ $vis:vis fn $funcname:ident();
+ ) => {
+ #[doc = concat!("A version of [`", stringify!($constname), "()`] that uses custom data provided by a [`DataProvider`].")]
+ ///
+ /// Note that this will return an owned version of the data. Functionality is available on
+ /// the borrowed version, accessible through [`CodePointSetData::as_borrowed`].
+ $vis fn $funcname(
+ provider: &(impl DataProvider<$keyed_data_marker> + ?Sized)
+ ) -> Result<CodePointSetData, PropertiesError> {
+ load_set_data(provider)
+ }
+
+ $(#[$doc])*
+ #[cfg(feature = "compiled_data")]
+ $cvis const fn $constname() -> CodePointSetDataBorrowed<'static> {
+ CodePointSetDataBorrowed {
+ set: crate::provider::Baked::$singleton_name,
+ }
+ }
+ }
+}
+
+make_code_point_set_property! {
+ property: "ASCII_Hex_Digit";
+ marker: AsciiHexDigitProperty;
+ keyed_data_marker: AsciiHexDigitV1Marker;
+ func:
+ /// ASCII characters commonly used for the representation of hexadecimal numbers
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let ascii_hex_digit = sets::ascii_hex_digit();
+ ///
+ /// assert!(ascii_hex_digit.contains('3'));
+ /// assert!(!ascii_hex_digit.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
+ /// assert!(ascii_hex_digit.contains('A'));
+ /// assert!(!ascii_hex_digit.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
+ /// ```
+ pub const fn ascii_hex_digit() => SINGLETON_PROPS_AHEX_V1;
+ pub fn load_ascii_hex_digit();
+}
+
+make_code_point_set_property! {
+ property: "Alnum";
+ marker: AlnumProperty;
+ keyed_data_marker: AlnumV1Marker;
+ func:
+ /// Characters with the Alphabetic or Decimal_Number property
+ /// This is defined for POSIX compatibility.
+
+ pub const fn alnum() => SINGLETON_PROPS_ALNUM_V1;
+ pub fn load_alnum();
+}
+
+make_code_point_set_property! {
+ property: "Alphabetic";
+ marker: AlphabeticProperty;
+ keyed_data_marker: AlphabeticV1Marker;
+ func:
+ /// Alphabetic characters
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let alphabetic = sets::alphabetic();
+ ///
+ /// assert!(!alphabetic.contains('3'));
+ /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
+ /// assert!(alphabetic.contains('A'));
+ /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
+ /// ```
+
+ pub const fn alphabetic() => SINGLETON_PROPS_ALPHA_V1;
+ pub fn load_alphabetic();
+}
+
+make_code_point_set_property! {
+ property: "Bidi_Control";
+ marker: BidiControlProperty;
+ keyed_data_marker: BidiControlV1Marker;
+ func:
+ /// Format control characters which have specific functions in the Unicode Bidirectional
+ /// Algorithm
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let bidi_control = sets::bidi_control();
+ ///
+ /// assert!(bidi_control.contains32(0x200F)); // RIGHT-TO-LEFT MARK
+ /// assert!(!bidi_control.contains('ش')); // U+0634 ARABIC LETTER SHEEN
+ /// ```
+
+ pub const fn bidi_control() => SINGLETON_PROPS_BIDI_C_V1;
+ pub fn load_bidi_control();
+}
+
+make_code_point_set_property! {
+ property: "Bidi_Mirrored";
+ marker: BidiMirroredProperty;
+ keyed_data_marker: BidiMirroredV1Marker;
+ func:
+ /// Characters that are mirrored in bidirectional text
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let bidi_mirrored = sets::bidi_mirrored();
+ ///
+ /// assert!(bidi_mirrored.contains('['));
+ /// assert!(bidi_mirrored.contains(']'));
+ /// assert!(bidi_mirrored.contains('∑')); // U+2211 N-ARY SUMMATION
+ /// assert!(!bidi_mirrored.contains('ཉ')); // U+0F49 TIBETAN LETTER NYA
+ /// ```
+
+ pub const fn bidi_mirrored() => SINGLETON_PROPS_BIDI_M_V1;
+ pub fn load_bidi_mirrored();
+}
+
+make_code_point_set_property! {
+ property: "Blank";
+ marker: BlankProperty;
+ keyed_data_marker: BlankV1Marker;
+ func:
+ /// Horizontal whitespace characters
+
+ pub const fn blank() => SINGLETON_PROPS_BLANK_V1;
+ pub fn load_blank();
+}
+
+make_code_point_set_property! {
+ property: "Cased";
+ marker: CasedProperty;
+ keyed_data_marker: CasedV1Marker;
+ func:
+ /// Uppercase, lowercase, and titlecase characters
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let cased = sets::cased();
+ ///
+ /// assert!(cased.contains('Ꙡ')); // U+A660 CYRILLIC CAPITAL LETTER REVERSED TSE
+ /// assert!(!cased.contains('ދ')); // U+078B THAANA LETTER DHAALU
+ /// ```
+
+ pub const fn cased() => SINGLETON_PROPS_CASED_V1;
+ pub fn load_cased();
+}
+
+make_code_point_set_property! {
+ property: "Case_Ignorable";
+ marker: CaseIgnorableProperty;
+ keyed_data_marker: CaseIgnorableV1Marker;
+ func:
+ /// Characters which are ignored for casing purposes
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let case_ignorable = sets::case_ignorable();
+ ///
+ /// assert!(case_ignorable.contains(':'));
+ /// assert!(!case_ignorable.contains('λ')); // U+03BB GREEK SMALL LETTER LAMDA
+ /// ```
+
+ pub const fn case_ignorable() => SINGLETON_PROPS_CI_V1;
+ pub fn load_case_ignorable();
+}
+
+make_code_point_set_property! {
+ property: "Full_Composition_Exclusion";
+ marker: FullCompositionExclusionProperty;
+ keyed_data_marker: FullCompositionExclusionV1Marker;
+ func:
+ /// Characters that are excluded from composition
+ /// See <https://unicode.org/Public/UNIDATA/CompositionExclusions.txt>
+
+ pub const fn full_composition_exclusion() => SINGLETON_PROPS_COMP_EX_V1;
+ pub fn load_full_composition_exclusion();
+}
+
+make_code_point_set_property! {
+ property: "Changes_When_Casefolded";
+ marker: ChangesWhenCasefoldedProperty;
+ keyed_data_marker: ChangesWhenCasefoldedV1Marker;
+ func:
+ /// Characters whose normalized forms are not stable under case folding
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let changes_when_casefolded = sets::changes_when_casefolded();
+ ///
+ /// assert!(changes_when_casefolded.contains('ß')); // U+00DF LATIN SMALL LETTER SHARP S
+ /// assert!(!changes_when_casefolded.contains('ᜉ')); // U+1709 TAGALOG LETTER PA
+ /// ```
+
+ pub const fn changes_when_casefolded() => SINGLETON_PROPS_CWCF_V1;
+ pub fn load_changes_when_casefolded();
+}
+
+make_code_point_set_property! {
+ property: "Changes_When_Casemapped";
+ marker: ChangesWhenCasemappedProperty;
+ keyed_data_marker: ChangesWhenCasemappedV1Marker;
+ func:
+ /// Characters which may change when they undergo case mapping
+
+ pub const fn changes_when_casemapped() => SINGLETON_PROPS_CWCM_V1;
+ pub fn load_changes_when_casemapped();
+}
+
+make_code_point_set_property! {
+ property: "Changes_When_NFKC_Casefolded";
+ marker: ChangesWhenNfkcCasefoldedProperty;
+ keyed_data_marker: ChangesWhenNfkcCasefoldedV1Marker;
+ func:
+ /// Characters which are not identical to their NFKC_Casefold mapping
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let changes_when_nfkc_casefolded = sets::changes_when_nfkc_casefolded();
+ ///
+ /// assert!(changes_when_nfkc_casefolded.contains('🄵')); // U+1F135 SQUARED LATIN CAPITAL LETTER F
+ /// assert!(!changes_when_nfkc_casefolded.contains('f'));
+ /// ```
+
+ pub const fn changes_when_nfkc_casefolded() => SINGLETON_PROPS_CWKCF_V1;
+ pub fn load_changes_when_nfkc_casefolded();
+}
+
+make_code_point_set_property! {
+ property: "Changes_When_Lowercased";
+ marker: ChangesWhenLowercasedProperty;
+ keyed_data_marker: ChangesWhenLowercasedV1Marker;
+ func:
+ /// Characters whose normalized forms are not stable under a toLowercase mapping
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let changes_when_lowercased = sets::changes_when_lowercased();
+ ///
+ /// assert!(changes_when_lowercased.contains('Ⴔ')); // U+10B4 GEORGIAN CAPITAL LETTER PHAR
+ /// assert!(!changes_when_lowercased.contains('ფ')); // U+10E4 GEORGIAN LETTER PHAR
+ /// ```
+
+ pub const fn changes_when_lowercased() => SINGLETON_PROPS_CWL_V1;
+ pub fn load_changes_when_lowercased();
+}
+
+make_code_point_set_property! {
+ property: "Changes_When_Titlecased";
+ marker: ChangesWhenTitlecasedProperty;
+ keyed_data_marker: ChangesWhenTitlecasedV1Marker;
+ func:
+ /// Characters whose normalized forms are not stable under a toTitlecase mapping
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let changes_when_titlecased = sets::changes_when_titlecased();
+ ///
+ /// assert!(changes_when_titlecased.contains('æ')); // U+00E6 LATIN SMALL LETTER AE
+ /// assert!(!changes_when_titlecased.contains('Æ')); // U+00C6 LATIN CAPITAL LETTER AE
+ /// ```
+
+ pub const fn changes_when_titlecased() => SINGLETON_PROPS_CWT_V1;
+ pub fn load_changes_when_titlecased();
+}
+
+make_code_point_set_property! {
+ property: "Changes_When_Uppercased";
+ marker: ChangesWhenUppercasedProperty;
+ keyed_data_marker: ChangesWhenUppercasedV1Marker;
+ func:
+ /// Characters whose normalized forms are not stable under a toUppercase mapping
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let changes_when_uppercased = sets::changes_when_uppercased();
+ ///
+ /// assert!(changes_when_uppercased.contains('ւ')); // U+0582 ARMENIAN SMALL LETTER YIWN
+ /// assert!(!changes_when_uppercased.contains('Ւ')); // U+0552 ARMENIAN CAPITAL LETTER YIWN
+ /// ```
+
+ pub const fn changes_when_uppercased() => SINGLETON_PROPS_CWU_V1;
+ pub fn load_changes_when_uppercased();
+}
+
+make_code_point_set_property! {
+ property: "Dash";
+ marker: DashProperty;
+ keyed_data_marker: DashV1Marker;
+ func:
+ /// Punctuation characters explicitly called out as dashes in the Unicode Standard, plus
+ /// their compatibility equivalents
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let dash = sets::dash();
+ ///
+ /// assert!(dash.contains('⸺')); // U+2E3A TWO-EM DASH
+ /// assert!(dash.contains('-')); // U+002D
+ /// assert!(!dash.contains('=')); // U+003D
+ /// ```
+
+ pub const fn dash() => SINGLETON_PROPS_DASH_V1;
+ pub fn load_dash();
+}
+
+make_code_point_set_property! {
+ property: "Deprecated";
+ marker: DeprecatedProperty;
+ keyed_data_marker: DeprecatedV1Marker;
+ func:
+ /// Deprecated characters. No characters will ever be removed from the standard, but the
+ /// usage of deprecated characters is strongly discouraged.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let deprecated = sets::deprecated();
+ ///
+ /// assert!(deprecated.contains('ឣ')); // U+17A3 KHMER INDEPENDENT VOWEL QAQ
+ /// assert!(!deprecated.contains('A'));
+ /// ```
+
+ pub const fn deprecated() => SINGLETON_PROPS_DEP_V1;
+ pub fn load_deprecated();
+}
+
+make_code_point_set_property! {
+ property: "Default_Ignorable_Code_Point";
+ marker: DefaultIgnorableCodePointProperty;
+ keyed_data_marker: DefaultIgnorableCodePointV1Marker;
+ func:
+ /// For programmatic determination of default ignorable code points. New characters that
+ /// should be ignored in rendering (unless explicitly supported) will be assigned in these
+ /// ranges, permitting programs to correctly handle the default rendering of such
+ /// characters when not otherwise supported.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let default_ignorable_code_point = sets::default_ignorable_code_point();
+ ///
+ /// assert!(default_ignorable_code_point.contains32(0x180B)); // MONGOLIAN FREE VARIATION SELECTOR ONE
+ /// assert!(!default_ignorable_code_point.contains('E'));
+ /// ```
+
+ pub const fn default_ignorable_code_point() => SINGLETON_PROPS_DI_V1;
+ pub fn load_default_ignorable_code_point();
+}
+
+make_code_point_set_property! {
+ property: "Diacritic";
+ marker: DiacriticProperty;
+ keyed_data_marker: DiacriticV1Marker;
+ func:
+ /// Characters that linguistically modify the meaning of another character to which they apply
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let diacritic = sets::diacritic();
+ ///
+ /// assert!(diacritic.contains('\u{05B3}')); // HEBREW POINT HATAF QAMATS
+ /// assert!(!diacritic.contains('א')); // U+05D0 HEBREW LETTER ALEF
+ /// ```
+
+ pub const fn diacritic() => SINGLETON_PROPS_DIA_V1;
+ pub fn load_diacritic();
+}
+
+make_code_point_set_property! {
+ property: "Emoji_Modifier_Base";
+ marker: EmojiModifierBaseProperty;
+ keyed_data_marker: EmojiModifierBaseV1Marker;
+ func:
+ /// Characters that can serve as a base for emoji modifiers
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let emoji_modifier_base = sets::emoji_modifier_base();
+ ///
+ /// assert!(emoji_modifier_base.contains('✊')); // U+270A RAISED FIST
+ /// assert!(!emoji_modifier_base.contains('⛰')); // U+26F0 MOUNTAIN
+ /// ```
+
+ pub const fn emoji_modifier_base() => SINGLETON_PROPS_EBASE_V1;
+ pub fn load_emoji_modifier_base();
+}
+
+make_code_point_set_property! {
+ property: "Emoji_Component";
+ marker: EmojiComponentProperty;
+ keyed_data_marker: EmojiComponentV1Marker;
+ func:
+ /// Characters used in emoji sequences that normally do not appear on emoji keyboards as
+ /// separate choices, such as base characters for emoji keycaps
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let emoji_component = sets::emoji_component();
+ ///
+ /// assert!(emoji_component.contains('🇹')); // U+1F1F9 REGIONAL INDICATOR SYMBOL LETTER T
+ /// assert!(emoji_component.contains32(0x20E3)); // COMBINING ENCLOSING KEYCAP
+ /// assert!(emoji_component.contains('7'));
+ /// assert!(!emoji_component.contains('T'));
+ /// ```
+
+ pub const fn emoji_component() => SINGLETON_PROPS_ECOMP_V1;
+ pub fn load_emoji_component();
+}
+
+make_code_point_set_property! {
+ property: "Emoji_Modifier";
+ marker: EmojiModifierProperty;
+ keyed_data_marker: EmojiModifierV1Marker;
+ func:
+ /// Characters that are emoji modifiers
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let emoji_modifier = sets::emoji_modifier();
+ ///
+ /// assert!(emoji_modifier.contains32(0x1F3FD)); // EMOJI MODIFIER FITZPATRICK TYPE-4
+ /// assert!(!emoji_modifier.contains32(0x200C)); // ZERO WIDTH NON-JOINER
+ /// ```
+
+ pub const fn emoji_modifier() => SINGLETON_PROPS_EMOD_V1;
+ pub fn load_emoji_modifier();
+}
+
+make_code_point_set_property! {
+ property: "Emoji";
+ marker: EmojiProperty;
+ keyed_data_marker: EmojiV1Marker;
+ func:
+ /// Characters that are emoji
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let emoji = sets::emoji();
+ ///
+ /// assert!(emoji.contains('🔥')); // U+1F525 FIRE
+ /// assert!(!emoji.contains('V'));
+ /// ```
+
+ pub const fn emoji() => SINGLETON_PROPS_EMOJI_V1;
+ pub fn load_emoji();
+}
+
+make_code_point_set_property! {
+ property: "Emoji_Presentation";
+ marker: EmojiPresentationProperty;
+ keyed_data_marker: EmojiPresentationV1Marker;
+ func:
+ /// Characters that have emoji presentation by default
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let emoji_presentation = sets::emoji_presentation();
+ ///
+ /// assert!(emoji_presentation.contains('🦬')); // U+1F9AC BISON
+ /// assert!(!emoji_presentation.contains('♻')); // U+267B BLACK UNIVERSAL RECYCLING SYMBOL
+ /// ```
+
+ pub const fn emoji_presentation() => SINGLETON_PROPS_EPRES_V1;
+ pub fn load_emoji_presentation();
+}
+
+make_code_point_set_property! {
+ property: "Extender";
+ marker: ExtenderProperty;
+ keyed_data_marker: ExtenderV1Marker;
+ func:
+ /// Characters whose principal function is to extend the value of a preceding alphabetic
+ /// character or to extend the shape of adjacent characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let extender = sets::extender();
+ ///
+ /// assert!(extender.contains('ヾ')); // U+30FE KATAKANA VOICED ITERATION MARK
+ /// assert!(extender.contains('ー')); // U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK
+ /// assert!(!extender.contains('・')); // U+30FB KATAKANA MIDDLE DOT
+ /// ```
+
+ pub const fn extender() => SINGLETON_PROPS_EXT_V1;
+ pub fn load_extender();
+}
+
+make_code_point_set_property! {
+ property: "Extended_Pictographic";
+ marker: ExtendedPictographicProperty;
+ keyed_data_marker: ExtendedPictographicV1Marker;
+ func:
+ /// Pictographic symbols, as well as reserved ranges in blocks largely associated with
+ /// emoji characters
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let extended_pictographic = sets::extended_pictographic();
+ ///
+ /// assert!(extended_pictographic.contains('🥳')); // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
+ /// assert!(!extended_pictographic.contains('🇪')); // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E
+ /// ```
+
+ pub const fn extended_pictographic() => SINGLETON_PROPS_EXTPICT_V1;
+ pub fn load_extended_pictographic();
+}
+
+make_code_point_set_property! {
+ property: "Graph";
+ marker: GraphProperty;
+ keyed_data_marker: GraphV1Marker;
+ func:
+ /// Visible characters.
+ /// This is defined for POSIX compatibility.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn graph() => SINGLETON_PROPS_GRAPH_V1;
+ pub fn load_graph();
+}
+
+make_code_point_set_property! {
+ property: "Grapheme_Base";
+ marker: GraphemeBaseProperty;
+ keyed_data_marker: GraphemeBaseV1Marker;
+ func:
+ /// Property used together with the definition of Standard Korean Syllable Block to define
+ /// "Grapheme base". See D58 in Chapter 3, Conformance in the Unicode Standard.
+ ///
+ /// The example for the `Grapheme_Extend` property tests the same three code points and
+ /// gives the opposite result for each of them.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let grapheme_base = sets::grapheme_base();
+ ///
+ /// assert!(grapheme_base.contains('ക')); // U+0D15 MALAYALAM LETTER KA
+ /// assert!(grapheme_base.contains('\u{0D3F}')); // U+0D3F MALAYALAM VOWEL SIGN I
+ /// assert!(!grapheme_base.contains('\u{0D3E}')); // U+0D3E MALAYALAM VOWEL SIGN AA
+ /// ```
+
+ pub const fn grapheme_base() => SINGLETON_PROPS_GR_BASE_V1;
+ pub fn load_grapheme_base();
+}
+
+make_code_point_set_property! {
+ property: "Grapheme_Extend";
+ marker: GraphemeExtendProperty;
+ keyed_data_marker: GraphemeExtendV1Marker;
+ func:
+ /// Property used to define "Grapheme extender". See D59 in Chapter 3, Conformance in the
+ /// Unicode Standard.
+ ///
+ /// The example for the `Grapheme_Base` property tests the same three code points and
+ /// gives the opposite result for each of them.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let grapheme_extend = sets::grapheme_extend();
+ ///
+ /// assert!(!grapheme_extend.contains('ക')); // U+0D15 MALAYALAM LETTER KA
+ /// assert!(!grapheme_extend.contains('\u{0D3F}')); // U+0D3F MALAYALAM VOWEL SIGN I
+ /// assert!(grapheme_extend.contains('\u{0D3E}')); // U+0D3E MALAYALAM VOWEL SIGN AA
+ /// ```
+
+ pub const fn grapheme_extend() => SINGLETON_PROPS_GR_EXT_V1;
+ pub fn load_grapheme_extend();
+}
+
+make_code_point_set_property! {
+ property: "Grapheme_Link";
+ marker: GraphemeLinkProperty;
+ keyed_data_marker: GraphemeLinkV1Marker;
+ func:
+ /// Deprecated property. Formerly proposed for programmatic determination of grapheme
+ /// cluster boundaries.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn grapheme_link() => SINGLETON_PROPS_GR_LINK_V1;
+ pub fn load_grapheme_link();
+}
+
+make_code_point_set_property! {
+ property: "Hex_Digit";
+ marker: HexDigitProperty;
+ keyed_data_marker: HexDigitV1Marker;
+ func:
+ /// Characters commonly used for the representation of hexadecimal numbers, plus their
+ /// compatibility equivalents.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let hex_digit = sets::hex_digit();
+ ///
+ /// assert!(hex_digit.contains('0')); // U+0030 DIGIT ZERO
+ /// assert!(!hex_digit.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
+ /// assert!(hex_digit.contains('f')); // U+0066 LATIN SMALL LETTER F
+ /// assert!(hex_digit.contains('\u{FF46}')); // U+FF46 FULLWIDTH LATIN SMALL LETTER F
+ /// assert!(hex_digit.contains('\u{FF26}')); // U+FF26 FULLWIDTH LATIN CAPITAL LETTER F
+ /// assert!(!hex_digit.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
+ /// ```
+
+ pub const fn hex_digit() => SINGLETON_PROPS_HEX_V1;
+ pub fn load_hex_digit();
+}
+
+make_code_point_set_property! {
+ property: "Hyphen";
+ marker: HyphenProperty;
+ keyed_data_marker: HyphenV1Marker;
+ func:
+ /// Deprecated property. Dashes which are used to mark connections between pieces of
+ /// words, plus the Katakana middle dot.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn hyphen() => SINGLETON_PROPS_HYPHEN_V1;
+ pub fn load_hyphen();
+}
+
+make_code_point_set_property! {
+ property: "Id_Continue";
+ marker: IdContinueProperty;
+ keyed_data_marker: IdContinueV1Marker;
+ func:
+ /// Characters that can come after the first character in an identifier. If using NFKC to
+ /// fold differences between characters, use [`load_xid_continue`] instead. See
+ /// [`Unicode Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for
+ /// more details.
+ ///
+ /// The last assertion in the example shows a code point on which the two properties
+ /// differ: U+FC5E is in `ID_Continue` but not in `XID_Continue`.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let id_continue = sets::id_continue();
+ ///
+ /// assert!(id_continue.contains('x')); // U+0078 LATIN SMALL LETTER X
+ /// assert!(id_continue.contains('1')); // U+0031 DIGIT ONE
+ /// assert!(id_continue.contains('_')); // U+005F LOW LINE
+ /// assert!(id_continue.contains('ߝ')); // U+07DD NKO LETTER FA
+ /// assert!(!id_continue.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X
+ /// assert!(id_continue.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+ /// ```
+
+ pub const fn id_continue() => SINGLETON_PROPS_IDC_V1;
+ pub fn load_id_continue();
+}
+
+make_code_point_set_property! {
+ property: "Ideographic";
+ marker: IdeographicProperty;
+ keyed_data_marker: IdeographicV1Marker;
+ func:
+ /// Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese)
+ /// ideographs, or related siniform ideographs.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let ideographic = sets::ideographic();
+ ///
+ /// assert!(ideographic.contains('川')); // U+5DDD CJK UNIFIED IDEOGRAPH-5DDD
+ /// assert!(!ideographic.contains('밥')); // U+BC25 HANGUL SYLLABLE BAB
+ /// ```
+
+ pub const fn ideographic() => SINGLETON_PROPS_IDEO_V1;
+ pub fn load_ideographic();
+}
+
+make_code_point_set_property! {
+ property: "Id_Start";
+ marker: IdStartProperty;
+ keyed_data_marker: IdStartV1Marker;
+ func:
+ /// Characters that can begin an identifier. If using NFKC to fold differences between
+ /// characters, use [`load_xid_start`] instead. See [`Unicode Standard Annex
+ /// #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more details.
+ ///
+ /// The last assertion in the example shows a code point on which the two properties
+ /// differ: U+FC5E is in `ID_Start` but not in `XID_Start`.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let id_start = sets::id_start();
+ ///
+ /// assert!(id_start.contains('x')); // U+0078 LATIN SMALL LETTER X
+ /// assert!(!id_start.contains('1')); // U+0031 DIGIT ONE
+ /// assert!(!id_start.contains('_')); // U+005F LOW LINE
+ /// assert!(id_start.contains('ߝ')); // U+07DD NKO LETTER FA
+ /// assert!(!id_start.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X
+ /// assert!(id_start.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+ /// ```
+
+ pub const fn id_start() => SINGLETON_PROPS_IDS_V1;
+ pub fn load_id_start();
+}
+
+make_code_point_set_property! {
+ property: "Ids_Binary_Operator";
+ marker: IdsBinaryOperatorProperty;
+ keyed_data_marker: IdsBinaryOperatorV1Marker;
+ func:
+ /// Binary operator characters used in Ideographic Description Sequences.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let ids_binary_operator = sets::ids_binary_operator();
+ ///
+ /// assert!(ids_binary_operator.contains32(0x2FF5)); // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE
+ /// assert!(!ids_binary_operator.contains32(0x3006)); // IDEOGRAPHIC CLOSING MARK
+ /// ```
+
+ pub const fn ids_binary_operator() => SINGLETON_PROPS_IDSB_V1;
+ pub fn load_ids_binary_operator();
+}
+
+make_code_point_set_property! {
+ property: "Ids_Trinary_Operator";
+ marker: IdsTrinaryOperatorProperty;
+ keyed_data_marker: IdsTrinaryOperatorV1Marker;
+ func:
+ /// Trinary operator characters used in Ideographic Description Sequences.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let ids_trinary_operator = sets::ids_trinary_operator();
+ ///
+ /// assert!(ids_trinary_operator.contains32(0x2FF2)); // IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT
+ /// assert!(ids_trinary_operator.contains32(0x2FF3)); // IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW
+ /// assert!(!ids_trinary_operator.contains32(0x2FF4)); // IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND
+ /// assert!(!ids_trinary_operator.contains32(0x2FF5)); // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE
+ /// assert!(!ids_trinary_operator.contains32(0x3006)); // IDEOGRAPHIC CLOSING MARK
+ /// ```
+
+ pub const fn ids_trinary_operator() => SINGLETON_PROPS_IDST_V1;
+ pub fn load_ids_trinary_operator();
+}
+
+make_code_point_set_property! {
+ property: "Join_Control";
+ marker: JoinControlProperty;
+ keyed_data_marker: JoinControlV1Marker;
+ func:
+ /// Format control characters which have specific functions for control of cursive joining
+ /// and ligation.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let join_control = sets::join_control();
+ ///
+ /// assert!(join_control.contains32(0x200C)); // ZERO WIDTH NON-JOINER
+ /// assert!(join_control.contains32(0x200D)); // ZERO WIDTH JOINER
+ /// assert!(!join_control.contains32(0x200E)); // LEFT-TO-RIGHT MARK
+ /// ```
+
+ pub const fn join_control() => SINGLETON_PROPS_JOIN_C_V1;
+ pub fn load_join_control();
+}
+
+make_code_point_set_property! {
+ property: "Logical_Order_Exception";
+ marker: LogicalOrderExceptionProperty;
+ keyed_data_marker: LogicalOrderExceptionV1Marker;
+ func:
+ /// A small number of spacing vowel letters occurring in certain Southeast Asian scripts,
+ /// such as Thai and Lao.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let logical_order_exception = sets::logical_order_exception();
+ ///
+ /// assert!(logical_order_exception.contains('ແ')); // U+0EC1 LAO VOWEL SIGN EI
+ /// assert!(!logical_order_exception.contains('ະ')); // U+0EB0 LAO VOWEL SIGN A
+ /// ```
+
+ pub const fn logical_order_exception() => SINGLETON_PROPS_LOE_V1;
+ pub fn load_logical_order_exception();
+}
+
+make_code_point_set_property! {
+ property: "Lowercase";
+ marker: LowercaseProperty;
+ keyed_data_marker: LowercaseV1Marker;
+ func:
+ /// Lowercase characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let lowercase = sets::lowercase();
+ ///
+ /// assert!(lowercase.contains('a')); // U+0061 LATIN SMALL LETTER A
+ /// assert!(!lowercase.contains('A')); // U+0041 LATIN CAPITAL LETTER A
+ /// ```
+
+ pub const fn lowercase() => SINGLETON_PROPS_LOWER_V1;
+ pub fn load_lowercase();
+}
+
+make_code_point_set_property! {
+ property: "Math";
+ marker: MathProperty;
+ keyed_data_marker: MathV1Marker;
+ func:
+ /// Characters used in mathematical notation.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let math = sets::math();
+ ///
+ /// assert!(math.contains('=')); // U+003D EQUALS SIGN
+ /// assert!(math.contains('+')); // U+002B PLUS SIGN
+ /// assert!(!math.contains('-')); // U+002D HYPHEN-MINUS
+ /// assert!(math.contains('−')); // U+2212 MINUS SIGN
+ /// assert!(!math.contains('/')); // U+002F SOLIDUS
+ /// assert!(math.contains('∕')); // U+2215 DIVISION SLASH
+ /// ```
+
+ pub const fn math() => SINGLETON_PROPS_MATH_V1;
+ pub fn load_math();
+}
+
+make_code_point_set_property! {
+ property: "Noncharacter_Code_Point";
+ marker: NoncharacterCodePointProperty;
+ keyed_data_marker: NoncharacterCodePointV1Marker;
+ func:
+ /// Code points permanently reserved for internal use.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let noncharacter_code_point = sets::noncharacter_code_point();
+ ///
+ /// assert!(noncharacter_code_point.contains32(0xFDD0)); // first noncharacter of the range U+FDD0..U+FDEF
+ /// assert!(noncharacter_code_point.contains32(0xFFFF)); // last code point of the Basic Multilingual Plane
+ /// assert!(!noncharacter_code_point.contains32(0x10000)); // LINEAR B SYLLABLE B008 A
+ /// ```
+
+ pub const fn noncharacter_code_point() => SINGLETON_PROPS_NCHAR_V1;
+ pub fn load_noncharacter_code_point();
+}
+
+make_code_point_set_property! {
+ property: "NFC_Inert";
+ marker: NfcInertProperty;
+ keyed_data_marker: NfcInertV1Marker;
+ func:
+ /// Characters that are inert under NFC, i.e., they do not interact with adjacent characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn nfc_inert() => SINGLETON_PROPS_NFCINERT_V1;
+ pub fn load_nfc_inert();
+}
+
+make_code_point_set_property! {
+ property: "NFD_Inert";
+ marker: NfdInertProperty;
+ keyed_data_marker: NfdInertV1Marker;
+ func:
+ /// Characters that are inert under NFD, i.e., they do not interact with adjacent characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn nfd_inert() => SINGLETON_PROPS_NFDINERT_V1;
+ pub fn load_nfd_inert();
+}
+
+make_code_point_set_property! {
+ property: "NFKC_Inert";
+ marker: NfkcInertProperty;
+ keyed_data_marker: NfkcInertV1Marker;
+ func:
+ /// Characters that are inert under NFKC, i.e., they do not interact with adjacent characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn nfkc_inert() => SINGLETON_PROPS_NFKCINERT_V1;
+ pub fn load_nfkc_inert();
+}
+
+make_code_point_set_property! {
+ property: "NFKD_Inert";
+ marker: NfkdInertProperty;
+ keyed_data_marker: NfkdInertV1Marker;
+ func:
+ /// Characters that are inert under NFKD, i.e., they do not interact with adjacent characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn nfkd_inert() => SINGLETON_PROPS_NFKDINERT_V1;
+ pub fn load_nfkd_inert();
+}
+
+make_code_point_set_property! {
+ property: "Pattern_Syntax";
+ marker: PatternSyntaxProperty;
+ keyed_data_marker: PatternSyntaxV1Marker;
+ func:
+ /// Characters used as syntax in patterns (such as regular expressions). See [`Unicode
+ /// Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more
+ /// details.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let pattern_syntax = sets::pattern_syntax();
+ ///
+ /// assert!(pattern_syntax.contains('{')); // U+007B LEFT CURLY BRACKET
+ /// assert!(pattern_syntax.contains('⇒')); // U+21D2 RIGHTWARDS DOUBLE ARROW
+ /// assert!(!pattern_syntax.contains('0')); // U+0030 DIGIT ZERO
+ /// ```
+
+ pub const fn pattern_syntax() => SINGLETON_PROPS_PAT_SYN_V1;
+ pub fn load_pattern_syntax();
+}
+
+make_code_point_set_property! {
+ property: "Pattern_White_Space";
+ marker: PatternWhiteSpaceProperty;
+ keyed_data_marker: PatternWhiteSpaceV1Marker;
+ func:
+ /// Characters used as whitespace in patterns (such as regular expressions). See
+ /// [`Unicode Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for
+ /// more details.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let pattern_white_space = sets::pattern_white_space();
+ ///
+ /// assert!(pattern_white_space.contains(' ')); // U+0020 SPACE
+ /// assert!(pattern_white_space.contains32(0x2029)); // PARAGRAPH SEPARATOR
+ /// assert!(pattern_white_space.contains32(0x000A)); // NEW LINE
+ /// assert!(!pattern_white_space.contains32(0x00A0)); // NO-BREAK SPACE
+ /// ```
+
+ pub const fn pattern_white_space() => SINGLETON_PROPS_PAT_WS_V1;
+ pub fn load_pattern_white_space();
+}
+
+make_code_point_set_property! {
+ property: "Prepended_Concatenation_Mark";
+ marker: PrependedConcatenationMarkProperty;
+ keyed_data_marker: PrependedConcatenationMarkV1Marker;
+ func:
+ /// A small class of visible format controls, which precede and then span a sequence of
+ /// other characters, usually digits.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn prepended_concatenation_mark() => SINGLETON_PROPS_PCM_V1;
+ pub fn load_prepended_concatenation_mark();
+}
+
+make_code_point_set_property! {
+ property: "Print";
+ marker: PrintProperty;
+ keyed_data_marker: PrintV1Marker;
+ func:
+ /// Printable characters (visible characters and whitespace).
+ /// This is defined for POSIX compatibility.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn print() => SINGLETON_PROPS_PRINT_V1;
+ pub fn load_print();
+}
+
+make_code_point_set_property! {
+ property: "Quotation_Mark";
+ marker: QuotationMarkProperty;
+ keyed_data_marker: QuotationMarkV1Marker;
+ func:
+ /// Punctuation characters that function as quotation marks.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let quotation_mark = sets::quotation_mark();
+ ///
+ /// assert!(quotation_mark.contains('\'')); // U+0027 APOSTROPHE
+ /// assert!(quotation_mark.contains('„')); // U+201E DOUBLE LOW-9 QUOTATION MARK
+ /// assert!(!quotation_mark.contains('<')); // U+003C LESS-THAN SIGN
+ /// ```
+
+ pub const fn quotation_mark() => SINGLETON_PROPS_QMARK_V1;
+ pub fn load_quotation_mark();
+}
+
+make_code_point_set_property! {
+ property: "Radical";
+ marker: RadicalProperty;
+ keyed_data_marker: RadicalV1Marker;
+ func:
+ /// Characters used in the definition of Ideographic Description Sequences.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let radical = sets::radical();
+ ///
+ /// assert!(radical.contains('⺆')); // U+2E86 CJK RADICAL BOX
+ /// assert!(!radical.contains('丹')); // U+F95E CJK COMPATIBILITY IDEOGRAPH-F95E
+ /// ```
+
+ pub const fn radical() => SINGLETON_PROPS_RADICAL_V1;
+ pub fn load_radical();
+}
+
+make_code_point_set_property! {
+ property: "Regional_Indicator";
+ marker: RegionalIndicatorProperty;
+ keyed_data_marker: RegionalIndicatorV1Marker;
+ func:
+ /// Regional indicator characters, U+1F1E6..U+1F1FF.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let regional_indicator = sets::regional_indicator();
+ ///
+ /// assert!(regional_indicator.contains('🇹')); // U+1F1F9 REGIONAL INDICATOR SYMBOL LETTER T
+ /// assert!(!regional_indicator.contains('Ⓣ')); // U+24C9 CIRCLED LATIN CAPITAL LETTER T
+ /// assert!(!regional_indicator.contains('T')); // U+0054 LATIN CAPITAL LETTER T
+ /// ```
+
+ pub const fn regional_indicator() => SINGLETON_PROPS_RI_V1;
+ pub fn load_regional_indicator();
+}
+
+make_code_point_set_property! {
+ property: "Soft_Dotted";
+ marker: SoftDottedProperty;
+ keyed_data_marker: SoftDottedV1Marker;
+ func:
+ /// Characters with a "soft dot", like i or j. An accent placed on these characters causes
+ /// the dot to disappear.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let soft_dotted = sets::soft_dotted();
+ ///
+ /// assert!(soft_dotted.contains('і')); // U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
+ /// assert!(!soft_dotted.contains('ı')); // U+0131 LATIN SMALL LETTER DOTLESS I
+ /// ```
+
+ pub const fn soft_dotted() => SINGLETON_PROPS_SD_V1;
+ pub fn load_soft_dotted();
+}
+
+make_code_point_set_property! {
+ property: "Segment_Starter";
+ marker: SegmentStarterProperty;
+ keyed_data_marker: SegmentStarterV1Marker;
+ func:
+ /// Characters that are starters in terms of Unicode normalization and combining character
+ /// sequences.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn segment_starter() => SINGLETON_PROPS_SEGSTART_V1;
+ pub fn load_segment_starter();
+}
+
+make_code_point_set_property! {
+ property: "Case_Sensitive";
+ marker: CaseSensitiveProperty;
+ keyed_data_marker: CaseSensitiveV1Marker;
+ func:
+ /// Characters that are either the source of a case mapping or in the target of a case
+ /// mapping.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn case_sensitive() => SINGLETON_PROPS_SENSITIVE_V1;
+ pub fn load_case_sensitive();
+}
+
+make_code_point_set_property! {
+ property: "Sentence_Terminal";
+ marker: SentenceTerminalProperty;
+ keyed_data_marker: SentenceTerminalV1Marker;
+ func:
+ /// Punctuation characters that generally mark the end of sentences.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let sentence_terminal = sets::sentence_terminal();
+ ///
+ /// assert!(sentence_terminal.contains('.')); // U+002E FULL STOP
+ /// assert!(sentence_terminal.contains('?')); // U+003F QUESTION MARK
+ /// assert!(sentence_terminal.contains('᪨')); // U+1AA8 TAI THAM SIGN KAAN
+ /// assert!(!sentence_terminal.contains(',')); // U+002C COMMA
+ /// assert!(!sentence_terminal.contains('¿')); // U+00BF INVERTED QUESTION MARK
+ /// ```
+
+ pub const fn sentence_terminal() => SINGLETON_PROPS_STERM_V1;
+ pub fn load_sentence_terminal();
+}
+
+make_code_point_set_property! {
+ property: "Terminal_Punctuation";
+ marker: TerminalPunctuationProperty;
+ keyed_data_marker: TerminalPunctuationV1Marker;
+ func:
+ /// Punctuation characters that generally mark the end of textual units.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let terminal_punctuation = sets::terminal_punctuation();
+ ///
+ /// assert!(terminal_punctuation.contains('.')); // U+002E FULL STOP
+ /// assert!(terminal_punctuation.contains('?')); // U+003F QUESTION MARK
+ /// assert!(terminal_punctuation.contains('᪨')); // U+1AA8 TAI THAM SIGN KAAN
+ /// assert!(terminal_punctuation.contains(',')); // U+002C COMMA
+ /// assert!(!terminal_punctuation.contains('¿')); // U+00BF INVERTED QUESTION MARK
+ /// ```
+
+ pub const fn terminal_punctuation() => SINGLETON_PROPS_TERM_V1;
+ pub fn load_terminal_punctuation();
+}
+
+make_code_point_set_property! {
+ property: "Unified_Ideograph";
+ marker: UnifiedIdeographProperty;
+ keyed_data_marker: UnifiedIdeographV1Marker;
+ func:
+ /// A property which specifies the exact set of Unified CJK Ideographs in the standard.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let unified_ideograph = sets::unified_ideograph();
+ ///
+ /// assert!(unified_ideograph.contains('川')); // U+5DDD CJK UNIFIED IDEOGRAPH-5DDD
+ /// assert!(unified_ideograph.contains('木')); // U+6728 CJK UNIFIED IDEOGRAPH-6728
+ /// assert!(!unified_ideograph.contains('𛅸')); // U+1B178 NUSHU CHARACTER-1B178
+ /// ```
+
+ pub const fn unified_ideograph() => SINGLETON_PROPS_UIDEO_V1;
+ pub fn load_unified_ideograph();
+}
+
+make_code_point_set_property! {
+ property: "Uppercase";
+ marker: UppercaseProperty;
+ keyed_data_marker: UppercaseV1Marker;
+ func:
+ /// Uppercase characters.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let uppercase = sets::uppercase();
+ ///
+ /// assert!(uppercase.contains('U')); // U+0055 LATIN CAPITAL LETTER U
+ /// assert!(!uppercase.contains('u')); // U+0075 LATIN SMALL LETTER U
+ /// ```
+
+ pub const fn uppercase() => SINGLETON_PROPS_UPPER_V1;
+ pub fn load_uppercase();
+}
+
+make_code_point_set_property! {
+ property: "Variation_Selector";
+ marker: VariationSelectorProperty;
+ keyed_data_marker: VariationSelectorV1Marker;
+ func:
+ /// Characters that are Variation Selectors. A variation selector requests a particular
+ /// variant form of the character it follows.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let variation_selector = sets::variation_selector();
+ ///
+ /// assert!(variation_selector.contains32(0x180D)); // MONGOLIAN FREE VARIATION SELECTOR THREE
+ /// assert!(!variation_selector.contains32(0x303E)); // IDEOGRAPHIC VARIATION INDICATOR
+ /// assert!(variation_selector.contains32(0xFE0F)); // VARIATION SELECTOR-16
+ /// assert!(!variation_selector.contains32(0xFE10)); // PRESENTATION FORM FOR VERTICAL COMMA
+ /// assert!(variation_selector.contains32(0xE01EF)); // VARIATION SELECTOR-256
+ /// ```
+
+ pub const fn variation_selector() => SINGLETON_PROPS_VS_V1;
+ pub fn load_variation_selector();
+}
+
+make_code_point_set_property! {
+ property: "White_Space";
+ marker: WhiteSpaceProperty;
+ keyed_data_marker: WhiteSpaceV1Marker;
+ func:
+ /// Spaces, separator characters and other control characters which should be treated by
+ /// programming languages as "white space" for the purpose of parsing elements.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let white_space = sets::white_space();
+ ///
+ /// assert!(white_space.contains(' ')); // U+0020 SPACE
+ /// assert!(white_space.contains32(0x000A)); // NEW LINE
+ /// assert!(white_space.contains32(0x00A0)); // NO-BREAK SPACE
+ /// assert!(!white_space.contains32(0x200B)); // ZERO WIDTH SPACE
+ /// ```
+
+ pub const fn white_space() => SINGLETON_PROPS_WSPACE_V1;
+ pub fn load_white_space();
+}
+
+make_code_point_set_property! {
+ property: "Xdigit";
+ marker: XdigitProperty;
+ keyed_data_marker: XdigitV1Marker;
+ func:
+ /// Hexadecimal digits.
+ /// This is defined for POSIX compatibility.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+
+ pub const fn xdigit() => SINGLETON_PROPS_XDIGIT_V1;
+ pub fn load_xdigit();
+}
+
+make_code_point_set_property! {
+ property: "XID_Continue";
+ marker: XidContinueProperty;
+ keyed_data_marker: XidContinueV1Marker;
+ func:
+ /// Characters that can come after the first character in an identifier. This is the
+ /// counterpart of `ID_Continue` to use when NFKC is used to fold differences between
+ /// characters. See [`Unicode Standard Annex
+ /// #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more details.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let xid_continue = sets::xid_continue();
+ ///
+ /// assert!(xid_continue.contains('x')); // U+0078 LATIN SMALL LETTER X
+ /// assert!(xid_continue.contains('1')); // U+0031 DIGIT ONE
+ /// assert!(xid_continue.contains('_')); // U+005F LOW LINE
+ /// assert!(xid_continue.contains('ߝ')); // U+07DD NKO LETTER FA
+ /// assert!(!xid_continue.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X
+ /// assert!(!xid_continue.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+ /// ```
+
+ pub const fn xid_continue() => SINGLETON_PROPS_XIDC_V1;
+ pub fn load_xid_continue();
+}
+
+make_code_point_set_property! {
+ property: "XID_Start";
+ marker: XidStartProperty;
+ keyed_data_marker: XidStartV1Marker;
+ func:
+ /// Characters that can begin an identifier. This is the counterpart of `ID_Start` to use
+ /// when NFKC is used to fold differences between characters. See [`Unicode
+ /// Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more
+ /// details.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let xid_start = sets::xid_start();
+ ///
+ /// assert!(xid_start.contains('x')); // U+0078 LATIN SMALL LETTER X
+ /// assert!(!xid_start.contains('1')); // U+0031 DIGIT ONE
+ /// assert!(!xid_start.contains('_')); // U+005F LOW LINE
+ /// assert!(xid_start.contains('ߝ')); // U+07DD NKO LETTER FA
+ /// assert!(!xid_start.contains('ⓧ')); // U+24E7 CIRCLED LATIN SMALL LETTER X
+ /// assert!(!xid_start.contains32(0xFC5E)); // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+ /// ```
+
+ pub const fn xid_start() => SINGLETON_PROPS_XIDS_V1;
+ pub fn load_xid_start();
+}
+
+//
+// Binary property getter fns
+// (data as sets of strings + code points)
+//
+
+// Expands one property description into two functions:
+//   * a loader taking a `DataProvider` for the keyed data marker, returning `UnicodeSetData`;
+//   * a `const` getter, compiled only with the `compiled_data` feature, that wraps the baked
+//     singleton in a `UnicodeSetDataBorrowed`. The doc attributes given after `func:` are
+//     attached to this getter.
+macro_rules! make_unicode_set_property {
+    (
+        // currently unused
+        property: $property:expr;
+        // currently unused
+        marker: $marker_name:ident;
+        keyed_data_marker: $keyed_data_marker:ty;
+        func:
+        $(#[$doc:meta])+
+        $getter_vis:vis const fn $getter:ident() => $singleton:ident;
+        $loader_vis:vis fn $loader:ident();
+    ) => {
+        #[doc = concat!("A version of [`", stringify!($getter), "()`] that uses custom data provided by a [`DataProvider`].")]
+        $loader_vis fn $loader(
+            provider: &(impl DataProvider<$keyed_data_marker> + ?Sized)
+        ) -> Result<UnicodeSetData, PropertiesError> {
+            // Any failure from the provider or from extracting the payload is converted
+            // into the function's error type by `?`.
+            let payload = provider
+                .load(Default::default())
+                .and_then(DataResponse::take_payload)?;
+            Ok(UnicodeSetData::from_data(payload))
+        }
+        $(#[$doc])*
+        #[cfg(feature = "compiled_data")]
+        $getter_vis const fn $getter() -> UnicodeSetDataBorrowed<'static> {
+            UnicodeSetDataBorrowed {
+                set: crate::provider::Baked::$singleton
+            }
+        }
+    }
+}
+
+make_unicode_set_property! {
+ property: "Basic_Emoji";
+ marker: BasicEmojiProperty;
+ keyed_data_marker: BasicEmojiV1Marker;
+ func:
+ /// Characters and character sequences intended for general-purpose, independent, direct input.
+ /// See [`Unicode Technical Standard #51`](https://unicode.org/reports/tr51/) for more
+ /// details.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu_properties::sets;
+ ///
+ /// let basic_emoji = sets::basic_emoji();
+ ///
+ /// assert!(!basic_emoji.contains32(0x0020)); // SPACE
+ /// assert!(!basic_emoji.contains_char('\n')); // U+000A, a control character
+ /// assert!(basic_emoji.contains_char('🦃')); // U+1F983 TURKEY
+ /// assert!(basic_emoji.contains("\u{1F983}")); // the same code point, queried as a string
+ /// assert!(basic_emoji.contains("\u{1F6E4}\u{FE0F}")); // railway track
+ /// assert!(!basic_emoji.contains("\u{0033}\u{FE0F}\u{20E3}")); // Emoji_Keycap_Sequence, keycap 3
+ /// ```
+ pub const fn basic_emoji() => SINGLETON_PROPS_BASIC_EMOJI_V1;
+ pub fn load_basic_emoji();
+}
+
+//
+// Enumerated property getter fns
+//
+
+/// Like [`for_general_category_group()`], but reads the General_Category map from a
+/// caller-supplied [`DataProvider`] instead of compiled data.
+///
+/// Every range of the map whose General_Category value has its bit set in `enum_val` is
+/// collected into the returned [`CodePointSetData`].
+///
+/// [📚 Help choosing a constructor](icu_provider::constructors)
+///
+/// # Errors
+///
+/// Returns the error produced by [`maps::load_general_category()`] if the map cannot be loaded.
+pub fn load_for_general_category_group(
+    provider: &(impl DataProvider<GeneralCategoryV1Marker> + ?Sized),
+    enum_val: GeneralCategoryGroup,
+) -> Result<CodePointSetData, PropertiesError> {
+    let payload = maps::load_general_category(provider)?;
+    let group_bits = enum_val.0;
+    // Keep a range only when the bit for its General_Category value is part of the group.
+    let set = CodePointInversionList::from_iter(
+        payload.as_borrowed().iter_ranges().filter_map(|entry| {
+            if (1 << entry.value as u32) & group_bits != 0 {
+                Some(entry.range)
+            } else {
+                None
+            }
+        }),
+    );
+    Ok(CodePointSetData::from_code_point_inversion_list(set))
+}
+
+/// Builds the [`CodePointSetData`] of all code points whose General_Category is covered by
+/// `enum_val`, which may be a single value or a grouping of values. See
+/// [`GeneralCategoryGroup`].
+///
+/// ✨ *Enabled with the `compiled_data` Cargo feature.*
+///
+/// [📚 Help choosing a constructor](icu_provider::constructors)
+#[cfg(feature = "compiled_data")]
+pub fn for_general_category_group(enum_val: GeneralCategoryGroup) -> CodePointSetData {
+    let group_bits = enum_val.0;
+    // Keep a range only when the bit for its General_Category value is part of the group.
+    let set = CodePointInversionList::from_iter(
+        maps::general_category().iter_ranges().filter_map(|entry| {
+            if (1 << entry.value as u32) & group_bits != 0 {
+                Some(entry.range)
+            } else {
+                None
+            }
+        }),
+    );
+    CodePointSetData::from_code_point_inversion_list(set)
+}
+
+/// Returns a type capable of looking up values for a property specified as a string, as long as it is a
+/// [binary property listed in ECMA-262][ecma], using strict matching on the names in the spec.
+///
+/// This handles every property required by ECMA-262 `/u` regular expressions, except for:
+///
+/// - `Script` and `General_Category`: handle these directly with [`maps::load_script()`] and
+///   [`maps::load_general_category()`], using property values parsed via
+///   [`Script::get_name_to_enum_mapper()`] and [`GeneralCategory::get_name_to_enum_mapper()`]
+///   if necessary.
+/// - `Script_Extensions`: handle this directly using APIs from [`crate::script`], like [`script::load_script_with_extensions_unstable()`]
+/// - `General_Category` mask values: Handle this alongside `General_Category` using [`GeneralCategoryGroup`],
+///   using property values parsed via [`GeneralCategoryGroup::get_name_to_enum_mapper()`] if necessary
+/// - `Any`, `Assigned`, and `ASCII` pseudoproperties: Handle these using their equivalent sets:
+///     - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]`
+///     - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`).
+///     - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]`
+/// - `General_Category` property values can themselves be treated like properties using a shorthand in ECMA262;
+///   simply create the corresponding `GeneralCategory` set.
+///
+/// # Errors
+///
+/// Returns [`PropertiesError::UnexpectedPropertyName`] when `name` is not accepted by the
+/// ECMA-262 name parser, or when it parses to a property that has no set handled here.
+///
+/// ✨ *Enabled with the `compiled_data` Cargo feature.*
+///
+/// [📚 Help choosing a constructor](icu_provider::constructors)
+///
+/// ```
+/// use icu::properties::sets;
+///
+/// let emoji = sets::load_for_ecma262("Emoji").expect("loading data failed");
+///
+/// assert!(emoji.contains('🔥')); // U+1F525 FIRE
+/// assert!(!emoji.contains('V'));
+/// ```
+///
+/// [ecma]: https://tc39.es/ecma262/#table-binary-unicode-properties
+#[cfg(feature = "compiled_data")]
+pub fn load_for_ecma262(name: &str) -> Result<CodePointSetDataBorrowed<'static>, PropertiesError> {
+    use crate::runtime::UnicodeProperty;
+
+    // Strict name matching: anything the parser rejects is reported as an unexpected name.
+    let prop = UnicodeProperty::parse_ecma262_name(name)
+        .ok_or(PropertiesError::UnexpectedPropertyName)?;
+    Ok(match prop {
+        UnicodeProperty::AsciiHexDigit => ascii_hex_digit(),
+        UnicodeProperty::Alphabetic => alphabetic(),
+        UnicodeProperty::BidiControl => bidi_control(),
+        UnicodeProperty::BidiMirrored => bidi_mirrored(),
+        UnicodeProperty::CaseIgnorable => case_ignorable(),
+        UnicodeProperty::Cased => cased(),
+        UnicodeProperty::ChangesWhenCasefolded => changes_when_casefolded(),
+        UnicodeProperty::ChangesWhenCasemapped => changes_when_casemapped(),
+        UnicodeProperty::ChangesWhenLowercased => changes_when_lowercased(),
+        UnicodeProperty::ChangesWhenNfkcCasefolded => changes_when_nfkc_casefolded(),
+        UnicodeProperty::ChangesWhenTitlecased => changes_when_titlecased(),
+        UnicodeProperty::ChangesWhenUppercased => changes_when_uppercased(),
+        UnicodeProperty::Dash => dash(),
+        UnicodeProperty::DefaultIgnorableCodePoint => default_ignorable_code_point(),
+        UnicodeProperty::Deprecated => deprecated(),
+        UnicodeProperty::Diacritic => diacritic(),
+        UnicodeProperty::Emoji => emoji(),
+        UnicodeProperty::EmojiComponent => emoji_component(),
+        UnicodeProperty::EmojiModifier => emoji_modifier(),
+        UnicodeProperty::EmojiModifierBase => emoji_modifier_base(),
+        UnicodeProperty::EmojiPresentation => emoji_presentation(),
+        UnicodeProperty::ExtendedPictographic => extended_pictographic(),
+        UnicodeProperty::Extender => extender(),
+        UnicodeProperty::GraphemeBase => grapheme_base(),
+        UnicodeProperty::GraphemeExtend => grapheme_extend(),
+        UnicodeProperty::HexDigit => hex_digit(),
+        UnicodeProperty::IdsBinaryOperator => ids_binary_operator(),
+        UnicodeProperty::IdsTrinaryOperator => ids_trinary_operator(),
+        UnicodeProperty::IdContinue => id_continue(),
+        UnicodeProperty::IdStart => id_start(),
+        UnicodeProperty::Ideographic => ideographic(),
+        UnicodeProperty::JoinControl => join_control(),
+        UnicodeProperty::LogicalOrderException => logical_order_exception(),
+        UnicodeProperty::Lowercase => lowercase(),
+        UnicodeProperty::Math => math(),
+        UnicodeProperty::NoncharacterCodePoint => noncharacter_code_point(),
+        UnicodeProperty::PatternSyntax => pattern_syntax(),
+        UnicodeProperty::PatternWhiteSpace => pattern_white_space(),
+        UnicodeProperty::QuotationMark => quotation_mark(),
+        UnicodeProperty::Radical => radical(),
+        UnicodeProperty::RegionalIndicator => regional_indicator(),
+        UnicodeProperty::SentenceTerminal => sentence_terminal(),
+        UnicodeProperty::SoftDotted => soft_dotted(),
+        UnicodeProperty::TerminalPunctuation => terminal_punctuation(),
+        UnicodeProperty::UnifiedIdeograph => unified_ideograph(),
+        UnicodeProperty::Uppercase => uppercase(),
+        UnicodeProperty::VariationSelector => variation_selector(),
+        UnicodeProperty::WhiteSpace => white_space(),
+        UnicodeProperty::XidContinue => xid_continue(),
+        UnicodeProperty::XidStart => xid_start(),
+        // The name parsed, but to a property without a binary set in this list.
+        _ => return Err(PropertiesError::UnexpectedPropertyName),
+    })
+}
+
+// Invokes icu_provider's constructor-generating macro for the `load_for_ecma262` family:
+// no locale argument, one extra `name: &str` argument, and the four function names listed
+// under `functions`.
+// NOTE(review): `#[cfg(skip)]` presumably suppresses the macro-generated compiled-data
+// constructor because `load_for_ecma262` is written by hand above — confirm against the
+// macro definition in icu_provider.
+icu_provider::gen_any_buffer_data_constructors!(
+ locale: skip,
+ name: &str,
+ result: Result<CodePointSetData, PropertiesError>,
+ #[cfg(skip)]
+ functions: [
+ load_for_ecma262,
+ load_for_ecma262_with_any_provider,
+ load_for_ecma262_with_buffer_provider,
+ load_for_ecma262_unstable,
+ ]
+);
+
+#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, load_for_ecma262)]
+pub fn load_for_ecma262_unstable<P>(
+    provider: &P,
+    name: &str,
+) -> Result<CodePointSetData, PropertiesError>
+where
+    P: ?Sized
+        + DataProvider<AsciiHexDigitV1Marker>
+        + DataProvider<AlphabeticV1Marker>
+        + DataProvider<BidiControlV1Marker>
+        + DataProvider<BidiMirroredV1Marker>
+        + DataProvider<CaseIgnorableV1Marker>
+        + DataProvider<CasedV1Marker>
+        + DataProvider<ChangesWhenCasefoldedV1Marker>
+        + DataProvider<ChangesWhenCasemappedV1Marker>
+        + DataProvider<ChangesWhenLowercasedV1Marker>
+        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
+        + DataProvider<ChangesWhenTitlecasedV1Marker>
+        + DataProvider<ChangesWhenUppercasedV1Marker>
+        + DataProvider<DashV1Marker>
+        + DataProvider<DefaultIgnorableCodePointV1Marker>
+        + DataProvider<DeprecatedV1Marker>
+        + DataProvider<DiacriticV1Marker>
+        + DataProvider<EmojiV1Marker>
+        + DataProvider<EmojiComponentV1Marker>
+        + DataProvider<EmojiModifierV1Marker>
+        + DataProvider<EmojiModifierBaseV1Marker>
+        + DataProvider<EmojiPresentationV1Marker>
+        + DataProvider<ExtendedPictographicV1Marker>
+        + DataProvider<ExtenderV1Marker>
+        + DataProvider<GraphemeBaseV1Marker>
+        + DataProvider<GraphemeExtendV1Marker>
+        + DataProvider<HexDigitV1Marker>
+        + DataProvider<IdsBinaryOperatorV1Marker>
+        + DataProvider<IdsTrinaryOperatorV1Marker>
+        + DataProvider<IdContinueV1Marker>
+        + DataProvider<IdStartV1Marker>
+        + DataProvider<IdeographicV1Marker>
+        + DataProvider<JoinControlV1Marker>
+        + DataProvider<LogicalOrderExceptionV1Marker>
+        + DataProvider<LowercaseV1Marker>
+        + DataProvider<MathV1Marker>
+        + DataProvider<NoncharacterCodePointV1Marker>
+        + DataProvider<PatternSyntaxV1Marker>
+        + DataProvider<PatternWhiteSpaceV1Marker>
+        + DataProvider<QuotationMarkV1Marker>
+        + DataProvider<RadicalV1Marker>
+        + DataProvider<RegionalIndicatorV1Marker>
+        + DataProvider<SentenceTerminalV1Marker>
+        + DataProvider<SoftDottedV1Marker>
+        + DataProvider<TerminalPunctuationV1Marker>
+        + DataProvider<UnifiedIdeographV1Marker>
+        + DataProvider<UppercaseV1Marker>
+        + DataProvider<VariationSelectorV1Marker>
+        + DataProvider<WhiteSpaceV1Marker>
+        + DataProvider<XidContinueV1Marker>
+        + DataProvider<XidStartV1Marker>,
+{
+    use crate::runtime::UnicodeProperty;
+
+    // A name the ECMA-262 parser rejects is reported as an unexpected property name.
+    let prop = UnicodeProperty::parse_ecma262_name(name)
+        .ok_or(PropertiesError::UnexpectedPropertyName)?;
+    // Dispatch to the per-property loader; each one already yields the final `Result`.
+    match prop {
+        UnicodeProperty::AsciiHexDigit => load_ascii_hex_digit(provider),
+        UnicodeProperty::Alphabetic => load_alphabetic(provider),
+        UnicodeProperty::BidiControl => load_bidi_control(provider),
+        UnicodeProperty::BidiMirrored => load_bidi_mirrored(provider),
+        UnicodeProperty::CaseIgnorable => load_case_ignorable(provider),
+        UnicodeProperty::Cased => load_cased(provider),
+        UnicodeProperty::ChangesWhenCasefolded => load_changes_when_casefolded(provider),
+        UnicodeProperty::ChangesWhenCasemapped => load_changes_when_casemapped(provider),
+        UnicodeProperty::ChangesWhenLowercased => load_changes_when_lowercased(provider),
+        UnicodeProperty::ChangesWhenNfkcCasefolded => load_changes_when_nfkc_casefolded(provider),
+        UnicodeProperty::ChangesWhenTitlecased => load_changes_when_titlecased(provider),
+        UnicodeProperty::ChangesWhenUppercased => load_changes_when_uppercased(provider),
+        UnicodeProperty::Dash => load_dash(provider),
+        UnicodeProperty::DefaultIgnorableCodePoint => load_default_ignorable_code_point(provider),
+        UnicodeProperty::Deprecated => load_deprecated(provider),
+        UnicodeProperty::Diacritic => load_diacritic(provider),
+        UnicodeProperty::Emoji => load_emoji(provider),
+        UnicodeProperty::EmojiComponent => load_emoji_component(provider),
+        UnicodeProperty::EmojiModifier => load_emoji_modifier(provider),
+        UnicodeProperty::EmojiModifierBase => load_emoji_modifier_base(provider),
+        UnicodeProperty::EmojiPresentation => load_emoji_presentation(provider),
+        UnicodeProperty::ExtendedPictographic => load_extended_pictographic(provider),
+        UnicodeProperty::Extender => load_extender(provider),
+        UnicodeProperty::GraphemeBase => load_grapheme_base(provider),
+        UnicodeProperty::GraphemeExtend => load_grapheme_extend(provider),
+        UnicodeProperty::HexDigit => load_hex_digit(provider),
+        UnicodeProperty::IdsBinaryOperator => load_ids_binary_operator(provider),
+        UnicodeProperty::IdsTrinaryOperator => load_ids_trinary_operator(provider),
+        UnicodeProperty::IdContinue => load_id_continue(provider),
+        UnicodeProperty::IdStart => load_id_start(provider),
+        UnicodeProperty::Ideographic => load_ideographic(provider),
+        UnicodeProperty::JoinControl => load_join_control(provider),
+        UnicodeProperty::LogicalOrderException => load_logical_order_exception(provider),
+        UnicodeProperty::Lowercase => load_lowercase(provider),
+        UnicodeProperty::Math => load_math(provider),
+        UnicodeProperty::NoncharacterCodePoint => load_noncharacter_code_point(provider),
+        UnicodeProperty::PatternSyntax => load_pattern_syntax(provider),
+        UnicodeProperty::PatternWhiteSpace => load_pattern_white_space(provider),
+        UnicodeProperty::QuotationMark => load_quotation_mark(provider),
+        UnicodeProperty::Radical => load_radical(provider),
+        UnicodeProperty::RegionalIndicator => load_regional_indicator(provider),
+        UnicodeProperty::SentenceTerminal => load_sentence_terminal(provider),
+        UnicodeProperty::SoftDotted => load_soft_dotted(provider),
+        UnicodeProperty::TerminalPunctuation => load_terminal_punctuation(provider),
+        UnicodeProperty::UnifiedIdeograph => load_unified_ideograph(provider),
+        UnicodeProperty::Uppercase => load_uppercase(provider),
+        UnicodeProperty::VariationSelector => load_variation_selector(provider),
+        UnicodeProperty::WhiteSpace => load_white_space(provider),
+        UnicodeProperty::XidContinue => load_xid_continue(provider),
+        UnicodeProperty::XidStart => load_xid_start(provider),
+        // Parsed successfully, but not one of the binary properties served here.
+        _ => Err(PropertiesError::UnexpectedPropertyName),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    /// The `Number` group set contains digits from several scripts and excludes letters.
+    #[test]
+    fn test_general_category() {
+        use icu::properties::sets;
+        use icu::properties::GeneralCategoryGroup;
+
+        let number_data = sets::for_general_category_group(GeneralCategoryGroup::Number);
+        let numbers = number_data.as_borrowed();
+
+        assert!(numbers.contains('5')); // U+0035 DIGIT FIVE
+        assert!(numbers.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
+        assert!(numbers.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE
+
+        assert!(!numbers.contains('A'));
+    }
+
+    /// The Thai script set contains Thai letters and digits, but neither Latin letters nor
+    /// the baht sign.
+    #[test]
+    fn test_script() {
+        use icu::properties::maps;
+        use icu::properties::Script;
+
+        let thai_set_data = maps::script().get_set_for_value(Script::Thai);
+        let thai_set = thai_set_data.as_borrowed();
+
+        assert!(thai_set.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
+        assert!(thai_set.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO
+
+        assert!(!thai_set.contains('A'));
+        assert!(!thai_set.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
+    }
+
+    /// Every General_Category group set equals the union of the sets of its subcategories.
+    #[test]
+    fn test_gc_groupings() {
+        use icu::properties::{maps, sets};
+        use icu::properties::{GeneralCategory, GeneralCategoryGroup};
+        use icu_collections::codepointinvlist::CodePointInversionListBuilder;
+
+        // Builds the union of the per-subcategory sets and compares its inversion list with
+        // the one of the group set.
+        let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
+            let group_data = sets::for_general_category_group(category);
+            let group_list = group_data
+                .as_code_point_inversion_list()
+                .expect("The data should be valid");
+
+            let mut union_builder = CodePointInversionListBuilder::new();
+            for &subcategory in subcategories {
+                let member_data = maps::general_category().get_set_for_value(subcategory);
+                let member_set = member_data.as_borrowed();
+                for member_range in member_set.iter_ranges() {
+                    union_builder.add_range_u32(&member_range);
+                }
+            }
+            let union_list = union_builder.build();
+            println!("{category:?} {subcategories:?}");
+            assert_eq!(
+                group_list.get_inversion_list_vec(),
+                union_list.get_inversion_list_vec()
+            );
+        };
+
+        test_group(
+            GeneralCategoryGroup::Letter,
+            &[
+                GeneralCategory::UppercaseLetter,
+                GeneralCategory::LowercaseLetter,
+                GeneralCategory::TitlecaseLetter,
+                GeneralCategory::ModifierLetter,
+                GeneralCategory::OtherLetter,
+            ],
+        );
+        test_group(
+            GeneralCategoryGroup::Other,
+            &[
+                GeneralCategory::Control,
+                GeneralCategory::Format,
+                GeneralCategory::Unassigned,
+                GeneralCategory::PrivateUse,
+                GeneralCategory::Surrogate,
+            ],
+        );
+        test_group(
+            GeneralCategoryGroup::Mark,
+            &[
+                GeneralCategory::SpacingMark,
+                GeneralCategory::EnclosingMark,
+                GeneralCategory::NonspacingMark,
+            ],
+        );
+        test_group(
+            GeneralCategoryGroup::Number,
+            &[
+                GeneralCategory::DecimalNumber,
+                GeneralCategory::LetterNumber,
+                GeneralCategory::OtherNumber,
+            ],
+        );
+        test_group(
+            GeneralCategoryGroup::Punctuation,
+            &[
+                GeneralCategory::ConnectorPunctuation,
+                GeneralCategory::DashPunctuation,
+                GeneralCategory::ClosePunctuation,
+                GeneralCategory::FinalPunctuation,
+                GeneralCategory::InitialPunctuation,
+                GeneralCategory::OtherPunctuation,
+                GeneralCategory::OpenPunctuation,
+            ],
+        );
+        test_group(
+            GeneralCategoryGroup::Symbol,
+            &[
+                GeneralCategory::CurrencySymbol,
+                GeneralCategory::ModifierSymbol,
+                GeneralCategory::MathSymbol,
+                GeneralCategory::OtherSymbol,
+            ],
+        );
+        test_group(
+            GeneralCategoryGroup::Separator,
+            &[
+                GeneralCategory::LineSeparator,
+                GeneralCategory::ParagraphSeparator,
+                GeneralCategory::SpaceSeparator,
+            ],
+        );
+    }
+
+    /// The `Surrogate` set covers the first, an interior and the last surrogate code point.
+    #[test]
+    fn test_gc_surrogate() {
+        use icu::properties::maps;
+        use icu::properties::GeneralCategory;
+
+        let surrogate_data =
+            maps::general_category().get_set_for_value(GeneralCategory::Surrogate);
+        let surrogate_set = surrogate_data.as_borrowed();
+
+        // Surrogates are not `char`s, so they are probed through the `u32` API.
+        assert!(surrogate_set.contains32(0xd800));
+        assert!(surrogate_set.contains32(0xd900));
+        assert!(surrogate_set.contains32(0xdfff));
+
+        assert!(!surrogate_set.contains('A'));
+    }
+}
diff --git a/third_party/rust/icu_properties/src/trievalue.rs b/third_party/rust/icu_properties/src/trievalue.rs
new file mode 100644
index 0000000000..d8b65e4aa9
--- /dev/null
+++ b/third_party/rust/icu_properties/src/trievalue.rs
@@ -0,0 +1,248 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use crate::provider::bidi_data::{
+ CheckedBidiPairedBracketType, MirroredPairedBracketData, MirroredPairedBracketDataTryFromError,
+};
+use crate::script::ScriptWithExt;
+use crate::{
+ BidiClass, CanonicalCombiningClass, EastAsianWidth, GeneralCategory, GeneralCategoryGroup,
+ GraphemeClusterBreak, IndicSyllabicCategory, LineBreak, Script, SentenceBreak, WordBreak,
+};
+use core::convert::TryInto;
+use core::num::TryFromIntError;
+use zerovec::ule::{AsULE, RawBytesULE};
+
+use icu_collections::codepointtrie::TrieValue;
+
+use core::convert::TryFrom;
+
+// Trie storage for `CanonicalCombiningClass`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for CanonicalCombiningClass {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `BidiClass`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for BidiClass {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `GeneralCategory`: the enum's `u8` discriminant, widened to `u32`.
+impl TrieValue for GeneralCategory {
+    type TryFromU32Error = &'static str;
+
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        // A value too large for `u8` is replaced by `u8::MAX`, which is itself out of range
+        // of the GeneralCategory enum, so both cases end in the same error.
+        let discriminant: u8 = i.try_into().unwrap_or(u8::MAX);
+        match GeneralCategory::new_from_u8(discriminant) {
+            Some(category) => Ok(category),
+            None => Err("Cannot parse GeneralCategory from integer"),
+        }
+    }
+
+    fn to_u32(self) -> u32 {
+        let discriminant = self as u8;
+        discriminant.into()
+    }
+}
+
+// Trie storage for `Script`: the wrapped `u16`, widened to `u32`.
+impl TrieValue for Script {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u16`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Script(u16::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `ScriptWithExt`: the wrapped `u16`, widened to `u32`.
+impl TrieValue for ScriptWithExt {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u16`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u16::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `EastAsianWidth`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for EastAsianWidth {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `LineBreak`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for LineBreak {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `GraphemeClusterBreak`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for GraphemeClusterBreak {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `WordBreak`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for WordBreak {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie storage for `SentenceBreak`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for SentenceBreak {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// Trie decoding for `CheckedBidiPairedBracketType`. The conversion never fails: 1 is `Open`,
+// 2 is `Close`, and every other value is treated as `None`.
+impl TrieValue for CheckedBidiPairedBracketType {
+    type TryFromU32Error = TryFromIntError;
+
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        let bracket_type = if i == 1 {
+            CheckedBidiPairedBracketType::Open
+        } else if i == 2 {
+            CheckedBidiPairedBracketType::Close
+        } else {
+            CheckedBidiPairedBracketType::None
+        };
+        Ok(bracket_type)
+    }
+}
+
+// Trie storage for `IndicSyllabicCategory`: the wrapped `u8`, widened to `u32`.
+impl TrieValue for IndicSyllabicCategory {
+    type TryFromU32Error = TryFromIntError;
+
+    // Fails when the stored value does not fit in a `u8`.
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        Ok(Self(u8::try_from(i)?))
+    }
+
+    fn to_u32(self) -> u32 {
+        self.0.into()
+    }
+}
+
+// GeneralCategoryGroup (GCG) is not stored inside tries, but the name lookup type does use
+// it, and there it has to fit into a u16. Its named mask values are specced, so the packing
+// can be done in code:
+//
+// - a mask with a single value becomes the corresponding GeneralCategory value
+// - each known multi-value mask gets its own special value
+// - anything else becomes 0xFF00; that path should only be reachable with malformed
+//   icuexportdata
+//
+// When unpacking, unknown values become the empty mask; that path should only be reachable
+// with malformed ICU4X generated data.
+impl AsULE for GeneralCategoryGroup {
+    type ULE = RawBytesULE<2>;
+
+    // Packs the mask into a u16 and stores that as unaligned bytes.
+    fn to_unaligned(self) -> Self::ULE {
+        gcg_to_packed_u16(self).to_unaligned()
+    }
+
+    // Reads the packed u16 back and expands it to a mask.
+    fn from_unaligned(ule: Self::ULE) -> Self {
+        packed_u16_to_gcg(ule.as_unsigned_int())
+    }
+}
+
+// Expands a packed u16 into a `GeneralCategoryGroup`: the eight special values name the
+// multi-value groups, values below 32 are read as single `GeneralCategory` values, and
+// everything else produces the empty mask.
+fn packed_u16_to_gcg(value: u16) -> GeneralCategoryGroup {
+    match value {
+        0xFFFF => GeneralCategoryGroup::CasedLetter,
+        0xFFFE => GeneralCategoryGroup::Letter,
+        0xFFFD => GeneralCategoryGroup::Mark,
+        0xFFFC => GeneralCategoryGroup::Number,
+        0xFFFB => GeneralCategoryGroup::Separator,
+        0xFFFA => GeneralCategoryGroup::Other,
+        0xFFF9 => GeneralCategoryGroup::Punctuation,
+        0xFFF8 => GeneralCategoryGroup::Symbol,
+        // In this range the cast to u8 is lossless.
+        0..=31 => match GeneralCategory::new_from_u8(value as u8) {
+            Some(category) => category.into(),
+            None => GeneralCategoryGroup(0),
+        },
+        // unknown values produce an empty mask
+        _ => GeneralCategoryGroup(0),
+    }
+}
+
+// Packs a `GeneralCategoryGroup` into a u16: a single-bit mask becomes the index of that bit,
+// each named multi-value group gets its special value, and any other mask becomes 0xFF00.
+fn gcg_to_packed_u16(gcg: GeneralCategoryGroup) -> u16 {
+    // Exactly one bit set: the mask stands for a single property, so store the bit index,
+    // which undoes the bitshift that built the mask.
+    if gcg.0.is_power_of_two() {
+        return gcg.0.trailing_zeros() as u16;
+    }
+    match gcg {
+        GeneralCategoryGroup::CasedLetter => 0xFFFF,
+        GeneralCategoryGroup::Letter => 0xFFFE,
+        GeneralCategoryGroup::Mark => 0xFFFD,
+        GeneralCategoryGroup::Number => 0xFFFC,
+        GeneralCategoryGroup::Separator => 0xFFFB,
+        GeneralCategoryGroup::Other => 0xFFFA,
+        GeneralCategoryGroup::Punctuation => 0xFFF9,
+        GeneralCategoryGroup::Symbol => 0xFFF8,
+        _ => 0xFF00, // random sentinel value
+    }
+}
+
+// Trie conversion for `GeneralCategoryGroup`, going through its packed u16 form.
+impl TrieValue for GeneralCategoryGroup {
+    type TryFromU32Error = TryFromIntError;
+
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        // TrieValue converts trie storage types to the actual type. Although the input is a
+        // u32, for this type it is always a packed u16, because the names map upcasts from
+        // u16; a value that does not fit in a u16 is an error.
+        let packed = u16::try_from(i)?;
+        Ok(packed_u16_to_gcg(packed))
+    }
+
+    fn to_u32(self) -> u32 {
+        gcg_to_packed_u16(self).into()
+    }
+}
+
+// Trie decoding for `MirroredPairedBracketData`, delegated to the type's own
+// `TryFrom<u32>` conversion.
+impl TrieValue for MirroredPairedBracketData {
+    type TryFromU32Error = MirroredPairedBracketDataTryFromError;
+
+    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
+        i.try_into()
+    }
+}