From 64d98f8ee037282c35007b64c2649055c56af1db Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:19:03 +0200 Subject: Merging upstream version 1.68.2+dfsg1. Signed-off-by: Daniel Baumann --- vendor/pest/src/unicode/mod.rs | 259 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 244 insertions(+), 15 deletions(-) (limited to 'vendor/pest/src/unicode/mod.rs') diff --git a/vendor/pest/src/unicode/mod.rs b/vendor/pest/src/unicode/mod.rs index 912a7ecee..8a56bd6e8 100644 --- a/vendor/pest/src/unicode/mod.rs +++ b/vendor/pest/src/unicode/mod.rs @@ -7,24 +7,51 @@ use alloc::boxed::Box; +macro_rules! property_functions { + ($module:ident, $property_names:ident, [$( + $prop:ident, + )*]) => { + #[allow(unused)] + mod $module; + // unicode::ALPHABETIC('a') + $(pub fn $prop(c: char) -> bool { + self::$module::$prop.contains_char(c) + })* + + pub static $property_names: &[&str] = &[ + $(stringify!($prop),)* + ]; + }; +} + macro_rules! char_property_functions { + // For define custom property names {$( mod $module:ident; - [$( + static $property_names:ident = [$( $prop:ident, )*]; )*} => {$( - #[allow(unused)] - mod $module; - $(pub fn $prop(c: char) -> bool { - self::$module::$prop.contains_char(c) - })* + property_functions!($module, $property_names, [$( + $prop, + )*]); + )*}; + // For define property by copy BY_NAME values from `ucd-generate` generated. + {$( + mod $module:ident; + static $property_names:ident = [$( + ($_name:tt, $prop:ident), + )*]; + )*} => {$( + property_functions!($module, $property_names, [$( + $prop, + )*]); )*}; } char_property_functions! { mod binary; - [ + static BINARY_PROPERTY_NAMES = [ // ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED, CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED, @@ -38,19 +65,215 @@ char_property_functions! { REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH, UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START, ]; +} +char_property_functions! { mod category; - [ - CASED_LETTER, CLOSE_PUNCTUATION, CONNECTOR_PUNCTUATION, CONTROL, CURRENCY_SYMBOL, - DASH_PUNCTUATION, DECIMAL_NUMBER, ENCLOSING_MARK, FINAL_PUNCTUATION, FORMAT, - INITIAL_PUNCTUATION, LETTER, LETTER_NUMBER, LINE_SEPARATOR, LOWERCASE_LETTER, MARK, - MATH_SYMBOL, MODIFIER_LETTER, MODIFIER_SYMBOL, NONSPACING_MARK, NUMBER, OPEN_PUNCTUATION, - OTHER, OTHER_LETTER, OTHER_NUMBER, OTHER_PUNCTUATION, OTHER_SYMBOL, PARAGRAPH_SEPARATOR, - PRIVATE_USE, PUNCTUATION, SEPARATOR, SPACE_SEPARATOR, SPACING_MARK, SURROGATE, SYMBOL, - TITLECASE_LETTER, UNASSIGNED, UPPERCASE_LETTER, + // Copy from category::BY_NAME + static CATEGORY_PROPERTY_NAMES = [ + ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK), + ("Surrogate", SURROGATE), ("Symbol", SYMBOL), + ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED), + ("Uppercase_Letter", UPPERCASE_LETTER), + ]; + + mod script; + // Copy from script::BY_NAME + static SCRIPT_PROPERTY_NAMES = [ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cypro_Minoan", CYPRO_MINOAN), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kawi", KAWI), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nag_Mundari", NAG_MUNDARI), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Old_Uyghur", OLD_UYGHUR), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangsa", TANGSA), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Toto", TOTO), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Vithkuqi", VITHKUQI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), ]; } +/// Return all available unicode property names +pub fn unicode_property_names() -> Box> { + Box::new( + BINARY_PROPERTY_NAMES + .iter() + .map(|name| *name) + .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name| *name)) + .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name| *name)), + ) +} + pub fn by_name(name: &str) -> Option bool>> { for property in binary::BY_NAME { if name == property.0.to_uppercase() { @@ -64,5 +287,11 @@ pub fn by_name(name: &str) -> Option bool>> { } } + for property in script::BY_NAME { + if name == property.0.to_uppercase() { + return Some(Box::new(move |c| property.1.contains_char(c))); + } + } + None } -- cgit v1.2.3