diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/icu_locid | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/icu_locid')
60 files changed, 8895 insertions, 0 deletions
diff --git a/third_party/rust/icu_locid/.cargo-checksum.json b/third_party/rust/icu_locid/.cargo-checksum.json new file mode 100644 index 0000000000..f281e7af64 --- /dev/null +++ b/third_party/rust/icu_locid/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.lock":"fa3d0827a7b2f8e928c86434cfa2f6b0f9e4aef207610f70fb500106afd58973","Cargo.toml":"def3e85ebc790882e4d0b39634ca78c0464e5a16f5a444881227c4c7b08ade1c","LICENSE":"853f87c96f3d249f200fec6db1114427bc8bdf4afddc93c576956d78152ce978","README.md":"27c4ba4df8933825ab38b7da3de3b678bee7f1b4a011c6268861da643ac52de2","benches/fixtures/langid.json":"373c11527653c63c685c9e229a8de5ae2b557c25b686a9d891c59e1f603232d8","benches/fixtures/locale.json":"669b19db933094290a45bf856559920f4e92401072e364ac82c482119dc9233a","benches/fixtures/mod.rs":"9a9671eddcf38a6faa10cb814949f8abc15d89f5e70f3ad6f684f1bc3ffe72ea","benches/fixtures/subtags.json":"28be3a639e452d713e807d5779b6819e06277e2dbbf67801ef34964fb9b074b6","benches/helpers/macros.rs":"bba0945a826bc083156bc302507c48c0c99c4d965e2a84352644d768591b0339","benches/helpers/mod.rs":"c98167d866fdb7f66c8cab41e8d57b5aab9e9707dfc66c37ef136e088dac6fef","benches/iai_langid.rs":"8e8f93e4b4e2e70771f86eccfaec8c38f2f8a79f569d72eef29a64bb730f3e0d","benches/langid.rs":"4e3d307d48fd9071308a567a0ef927b229814978abd2ba29f57c65edd51f38e4","benches/locale.rs":"b8d5b1e3f8b5578c549a5149229656fb60de26b76a1bf66b6c1abce75042d674","benches/subtags.rs":"e7e80dabaf31bf031779456614f139cafcdadb805986e71b49133ac964928432","examples/filter_langids.rs":"e000b860432b1646c74709177e4e4df1cfdc9620f41a677d15a5016bd7eb9b29","examples/syntatically_canonicalize_locales.rs":"49184e6e52e2264c61a479c0df88c01e7f7079f3da991445faecca3844594079","src/databake.rs":"894d0f73836d99915c5726d8984e7833498e841e2985cedfd5893aeb2bdcc958","src/extensions/mod.rs":"679a7df9291d4a86872a835288cb91b511ac9e5117af355a54ff2e44cbf242fd","src/extensions/other/mod.rs":"b126de24198275f847737f8d18715dc7276417f1028b14c84f970275d231c014","src/extensions/ot
her/subtag.rs":"41e9e943d67d5940c1fa01d027e8713c5e95819e4542c4a7f8fbadc9fc47b085","src/extensions/private/mod.rs":"febf66bf7415d9789bf423aaf7ec45b4a228fca1dc4a1d4f955a68ad77773ff6","src/extensions/private/other.rs":"604341d3888b946102bbc1f9d6d1cc146a1aed26c55158385641f7a80a434b72","src/extensions/transform/fields.rs":"208f7783a49f53e0e120c51b8cdf8df43387e69d0f8cca0621802097b17c9094","src/extensions/transform/key.rs":"05ef426db886862257b4d8e11d0d7762694e858ed8e6e683e40a765be1d7f05b","src/extensions/transform/mod.rs":"5d753717c10f66929c4a1698606370fdeca8074adf4eac98243d665d72ccd838","src/extensions/transform/value.rs":"31f596b2f70fe19e42992e08dd0ca1130a4b89a41719983609ebf486fe8e0985","src/extensions/unicode/attribute.rs":"021115b6b1880048411dc6a983039dbf4cfce8eabf6895afc008356f13ced960","src/extensions/unicode/attributes.rs":"3b6c10548f78c5a1735d2c6b6c6b5cd9a11c7195843b2f3d71241e2931286412","src/extensions/unicode/key.rs":"3822a2710eeb6d8569666a0f4097cc0a85e5e490d8b7ff0b75a983e686cb26d3","src/extensions/unicode/keywords.rs":"bc33ab96f39d5c0d0b94ed076aec778ebb296a6ac14d141aee0ee3785c442c6d","src/extensions/unicode/mod.rs":"9aaa6e8a3b243d8398bc30d95be7eb003a82d64937979544e52287663196452b","src/extensions/unicode/value.rs":"b25db7ee38e42aa8473bdb7ee7b6ae48339f8f6de2a7f6bddc7d93804df91f39","src/helpers.rs":"8860167ebd2de94a977241efb0a3b60699db537fc64633318fba71c659adcce8","src/langid.rs":"ec8d94542a20b8efba78318aae0a2ec8bcb7d33791ed32aec845b9bc7dc6a146","src/lib.rs":"b0086f71477baa14abe327aece797f3a305ebc0b7cfc0fb7105f1a1dd64108ca","src/locale.rs":"51a28b67ac5df4261fee82501d5bd42e2621b328b84cf85cdddd7c5f1dadc707","src/macros.rs":"f7154fc103ea1120a55bb5898540b20df80de6eec42e70ce15f339d997f2bf52","src/ordering.rs":"d76c6f26ffb5eb1e24646b70ce532985af326c46c488abda52c89074387f1dcc","src/parser/errors.rs":"8af937e67d197272c0f4806cc40cb191c878085b8687f987e358ee01ac2b5004","src/parser/langid.rs":"282678684bf1530a92d5070cd02caef0e5a2797eeebb2a793febe5c74cb15d23","src/parser/local
e.rs":"075c74803891894ad50bbedc69366931b8e76c0992b3caa1a5632f0a6816ccfd","src/parser/mod.rs":"5182392624876a419b1469d135d175aba680bb13d14e4f6ea0cfc4e071fbc743","src/serde.rs":"06e940e4f2d15f02d313b4e2b233aea3e74c93c6c43076f5ffe52d49c133608f","src/subtags/language.rs":"9d256e02908b57afdec69a453862af1a1267b04323b9522080e5dafc891a7a63","src/subtags/mod.rs":"a31350b679598b7452849ee6f8f56aefb2f2e370575ffe34dd643b62a0ca3fec","src/subtags/region.rs":"22a6dbe130899ebaab5482ca4f512de931eda1c5194639302995f696082344a2","src/subtags/script.rs":"1d57233bd238af03d4c347adb06b238bc108649da2bd395c03d6c1e879725f8a","src/subtags/variant.rs":"a0a14e1f26f1438b47e9419e696247f197c4a1042070a6e4a41620be720b5ac5","src/subtags/variants.rs":"33c9a8f979078461ae27b1812358a1c984594f4b5e66d2e5ba3ff649f89da7ba","src/zerovec.rs":"a34cfbce609b9ca711d22098a73f3cf17eec6ea73fd00f3106dca698b7dee29e","tests/fixtures/canonicalize.json":"c55a1cfa487f18d0019424802a9913fdb2349b286e6ca60e47d9d230c079f75a","tests/fixtures/invalid-extensions.json":"4b7888006360b216030597257de8c301e22877e75216818967bbd8c83b6dbb0b","tests/fixtures/invalid.json":"5247849a6eb805619b8e70254c855227f7bdaf71431b071c91c6cc378ae9766e","tests/fixtures/langid.json":"960fd01722217ef1ea9077e2e0821d7089fe318a241bd7fb7918f50bf8f3f5c3","tests/fixtures/locale.json":"df1b195b18780758a6b1c0264206b9cd9ac8c4741c5d6b0cc2b92f8e17991c17","tests/fixtures/mod.rs":"aea619960540b92199345cbd20ff03d2cb451aa2ce9aa6cf7915223ee9f812a3","tests/helpers/mod.rs":"d3bf59e7eed6230f340bef6c87a7b8de3a387ec391f60afc1b15a0d001cbfb67","tests/langid.rs":"82da0497c4b7d7c5d416ddb96bad0d13d4e51b735b5ed3164a25861de28e2118","tests/locale.rs":"25744873d84fdad4508af91acc51e9ca5971926afae8aaf9334f1618987987c1"},"package":"5c0aa2536adc14c07e2a521e95512b75ed8ef832f0fdf9299d4a0a45d2be2a9d"}
\ No newline at end of file diff --git a/third_party/rust/icu_locid/Cargo.lock b/third_party/rust/icu_locid/Cargo.lock new file mode 100644 index 0000000000..f1e1741ef6 --- /dev/null +++ b/third_party/rust/icu_locid/Cargo.lock @@ -0,0 +1,710 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" + +[[package]] +name = "ciborium-ll" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "3.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" +dependencies = [ + "bitflags", + "clap_lex", + "indexmap", + "textwrap", +] + +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "cobs" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" + +[[package]] +name = "criterion" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +dependencies = [ + "anes", + "atty", + "cast", + "ciborium", + "clap", + "criterion-plot", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" 
+version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "databake" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82175d72e69414ceafbe2b49686794d3a8bed846e0d50267355f83ea8fdd953a" +dependencies = [ + "databake-derive", + "proc-macro2", + "quote", +] + +[[package]] +name = "databake-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "377af281d8f23663862a7c84623bc5dcf7f8c44b13c7496a590bdc157f941a43" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "displaydoc" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "iai" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71a816c97c42258aa5834d07590b718b4c9a598944cd39a52dc25b351185d678" + +[[package]] +name = "icu_locid" +version = "1.4.0" +dependencies = [ + "criterion", + "databake", + "displaydoc", + "iai", + "litemap", + "postcard", + "serde", + "serde_json", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "js-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +dependencies = [ 
+ "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.148" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" + +[[package]] +name = "litemap" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "memchr" +version = "2.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num-traits" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "os_str_bytes" +version = "6.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac" + +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = "plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "postcard" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d534c6e61df1c7166e636ca612d9820d486fe96ddad37f7abc671517b297488e" +dependencies = [ + "cobs", + "serde", +] + +[[package]] +name = "proc-macro2" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" 
+dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.188" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.188" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = 
"1.0.107" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "285ba80e733fac80aa4270fbcdf83772a79b80aa35c97075320abfee4a915b06" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "textwrap" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" + +[[package]] +name = "tinystr" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83c02bf3c538ab32ba913408224323915f4ef9a6d61c0e85d493f355921c0ece" +dependencies = [ + "displaydoc", + "serde", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" + +[[package]] +name = "web-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + 
"winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "writeable" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dad7bb64b8ef9c0aa27b6da38b452b0ee9fd82beaf276a87dd796fb55cbae14e" + +[[package]] +name = "zerofrom" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "655b0814c5c0b19ade497851070c640773304939a6c0fd5f5fb43da0696d05b7" + +[[package]] +name = "zerovec" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eff4439ae91fb5c72b8abc12f3f2dbf51bd27e6eadb9f8a5bc8898dddb0e27ea" +dependencies = [ + "zerofrom", +] diff --git a/third_party/rust/icu_locid/Cargo.toml b/third_party/rust/icu_locid/Cargo.toml new file mode 100644 index 0000000000..8f8effbbd0 --- /dev/null +++ b/third_party/rust/icu_locid/Cargo.toml @@ -0,0 +1,134 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. 
+# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +rust-version = "1.67" +name = "icu_locid" +version = "1.4.0" +authors = ["The ICU4X Project Developers"] +include = [ + "data/**/*", + "src/**/*", + "examples/**/*", + "benches/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md", +] +description = "API for managing Unicode Language and Locale Identifiers" +homepage = "https://icu4x.unicode.org" +readme = "README.md" +categories = ["internationalization"] +license-file = "LICENSE" +repository = "https://github.com/unicode-org/icu4x" + +[package.metadata.cargo-all-features] +denylist = ["bench"] + +[package.metadata.docs.rs] +all-features = true + +[lib] +bench = false + +[[example]] +name = "filter_langids" +test = true + +[[example]] +name = "syntatically_canonicalize_locales" +test = true + +[[bench]] +name = "subtags" +harness = false +required-features = ["bench"] + +[[bench]] +name = "langid" +harness = false + +[[bench]] +name = "locale" +harness = false + +[[bench]] +name = "iai_langid" +harness = false +required-features = ["bench"] + +[dependencies.databake] +version = "0.1.7" +features = ["derive"] +optional = true +default-features = false + +[dependencies.displaydoc] +version = "0.2.3" +default-features = false + +[dependencies.litemap] +version = "0.7.2" +features = ["alloc"] +default-features = false + +[dependencies.serde] +version = "1.0" +features = [ + "alloc", + "derive", +] +optional = true +default-features = false + +[dependencies.tinystr] +version = "0.7.4" +features = ["alloc"] +default-features = false + +[dependencies.writeable] +version = "0.5.4" +default-features = false + +[dependencies.zerovec] +version = "0.10.1" +optional = true +default-features = false + +[dev-dependencies.iai] +version = "0.1.1" + +[dev-dependencies.postcard] +version = "1.0.0" +features 
= ["use-std"] +default-features = false + +[dev-dependencies.serde] +version = "1.0" +features = ["derive"] + +[dev-dependencies.serde_json] +version = "1.0" + +[features] +bench = ["serde"] +databake = ["dep:databake"] +serde = [ + "dep:serde", + "tinystr/serde", +] +std = [] +zerovec = ["dep:zerovec"] + +[target."cfg(not(target_arch = \"wasm32\"))".dev-dependencies.criterion] +version = "0.4" diff --git a/third_party/rust/icu_locid/LICENSE b/third_party/rust/icu_locid/LICENSE new file mode 100644 index 0000000000..9845aa5f48 --- /dev/null +++ b/third_party/rust/icu_locid/LICENSE @@ -0,0 +1,44 @@ +UNICODE LICENSE V3 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 2020-2023 Unicode, Inc. + +NOTICE TO USER: Carefully read the following legal agreement. BY +DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR +SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT +DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of data files and any associated documentation (the "Data Files") or +software and any associated documentation (the "Software") to deal in the +Data Files or Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Data Files or Software, and to permit persons to whom the +Data Files or Software are furnished to do so, provided that either (a) +this copyright and permission notice appear with all copies of the Data +Files or Software, or (b) this copyright and permission notice appear in +associated Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. 
+ +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE +BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, +OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA +FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. + +— + +Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. +ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. diff --git a/third_party/rust/icu_locid/README.md b/third_party/rust/icu_locid/README.md new file mode 100644 index 0000000000..9469e9b3cf --- /dev/null +++ b/third_party/rust/icu_locid/README.md @@ -0,0 +1,53 @@ +# icu_locid [![crates.io](https://img.shields.io/crates/v/icu_locid)](https://crates.io/crates/icu_locid) + +<!-- cargo-rdme start --> + +Parsing, manipulating, and serializing Unicode Language and Locale Identifiers. + +This module is published as its own crate ([`icu_locid`](https://docs.rs/icu_locid/latest/icu_locid/)) +and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. + +The module provides algorithms for parsing a string into a well-formed language or locale identifier +as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]. + +[`Locale`] is the most common structure to use for storing information about a language, +script, region, variants and extensions. In almost all cases, this struct should be used as the +base unit for all locale management operations. 
+ +[`LanguageIdentifier`] is a strict subset of [`Locale`] which can be useful in a narrow range of +cases where [`Unicode Extensions`] are not relevant. + +If in doubt, use [`Locale`]. + +## Examples + +```rust +use icu::locid::Locale; +use icu::locid::{ + locale, + subtags::{language, region}, +}; + +let mut loc: Locale = locale!("en-US"); + +assert_eq!(loc.id.language, language!("en")); +assert_eq!(loc.id.script, None); +assert_eq!(loc.id.region, Some(region!("US"))); +assert_eq!(loc.id.variants.len(), 0); + +loc.id.region = Some(region!("GB")); + +assert_eq!(loc, locale!("en-GB")); +``` + +For more details, see [`Locale`] and [`LanguageIdentifier`]. + +[`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]: https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers +[`ICU4X`]: ../icu/index.html +[`Unicode Extensions`]: extensions + +<!-- cargo-rdme end --> + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). 
diff --git a/third_party/rust/icu_locid/benches/fixtures/langid.json b/third_party/rust/icu_locid/benches/fixtures/langid.json new file mode 100644 index 0000000000..43c56d5a20 --- /dev/null +++ b/third_party/rust/icu_locid/benches/fixtures/langid.json @@ -0,0 +1,48 @@ +{ + "canonicalized": [ + "en-US", + "en-GB", + "es-AR", + "it", + "zh-Hans-CN", + "de-AT", + "pl", + "fr-FR", + "de-AT", + "sr-Cyrl-SR", + "nb-NO", + "fr-FR", + "mk", + "uk", + "en-US", + "en-GB", + "es-AR", + "th", + "de", + "zh-Cyrl-HN", + "en-Latn-US" + ], + "casing": [ + "En_uS", + "EN-GB", + "ES-aR", + "iT", + "zH_HaNs_cN", + "dE-aT", + "Pl", + "FR-FR", + "de_AT", + "sR-CyrL_sr", + "NB-NO", + "fr_fr", + "Mk", + "uK", + "en-us", + "en_gb", + "ES-AR", + "tH", + "DE", + "ZH_cyrl_hN", + "eN-lAtN-uS" + ] +} diff --git a/third_party/rust/icu_locid/benches/fixtures/locale.json b/third_party/rust/icu_locid/benches/fixtures/locale.json new file mode 100644 index 0000000000..f974a166ff --- /dev/null +++ b/third_party/rust/icu_locid/benches/fixtures/locale.json @@ -0,0 +1,26 @@ +{ + "canonicalized": [ + "en-US-u-hc-h12", + "en-GB-u-ca-gregory-hc-h12", + "es-AR-x-private", + "th-u-ca-buddhist", + "de-u-co-phonebk-ka-shifted", + "ar-u-nu-native", + "ar-u-nu-latn", + "ja-t-it", + "ja-Kana-t-it", + "und-Latn-t-und-cyrl" + ], + "casing": [ + "en-US-U-hc-h12", + "en-GB-u-CA-gregory-hc-h12", + "es-AR-x-Private", + "th-u-ca-buDDhist", + "de-u-co-phonebk-KA-shifted", + "AR_U-NU-native", + "ar-u-nu-LaTN", + "jA-T-it", + "ja-kanA-T-IT", + "unD-Latn-T-und-cyrl" + ] +} diff --git a/third_party/rust/icu_locid/benches/fixtures/mod.rs b/third_party/rust/icu_locid/benches/fixtures/mod.rs new file mode 100644 index 0000000000..006b223120 --- /dev/null +++ b/third_party/rust/icu_locid/benches/fixtures/mod.rs @@ -0,0 +1,25 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use serde::Deserialize; + +#[derive(Deserialize)] +pub struct SubtagData { + pub valid: Vec<String>, + pub invalid: Vec<String>, +} + +#[derive(Deserialize)] +pub struct Subtags { + pub language: SubtagData, + pub script: SubtagData, + pub region: SubtagData, + pub variant: SubtagData, +} + +#[derive(Deserialize)] +pub struct LocaleList { + pub canonicalized: Vec<String>, + pub casing: Vec<String>, +} diff --git a/third_party/rust/icu_locid/benches/fixtures/subtags.json b/third_party/rust/icu_locid/benches/fixtures/subtags.json new file mode 100644 index 0000000000..cf8419cc96 --- /dev/null +++ b/third_party/rust/icu_locid/benches/fixtures/subtags.json @@ -0,0 +1,18 @@ +{ + "language": { + "valid": ["en", "it", "pl", "de", "fr", "cs", "csb", "und", "ru", "nb", "NB", "UK", "pL", "Zh", "ES"], + "invalid": ["", "1", "$", "a1", "1211", "as_sa^a", "-0we", "3e3", "kk$$22", "testingaverylongstring"] + }, + "script": { + "valid": ["Latn", "latn", "Arab", "xxxx", "Flan", "fAlA", "oOoO", "pPlQ", "esta", "RUSS"], + "invalid": ["", "1", "$", "a1", "1211", "assaa", "-0we", "3e3", "kk$$22", "testingaverylongstring"] + }, + "region": { + "valid": ["DE", "321", "zh", "IA", "fN", "rU", "ru", "RU", "Ru", "CN", "AR"], + "invalid": ["", "1", "$", "a1", "1211", "assaa", "-0we", "3e3", "kk$$22", "testingaverylongstring"] + }, + "variant": { + "valid": ["macos", "MaCoS", "windows", "posix", "POSIX", "Posix", "linux", "lINUX", "mAcOs", "testing", "WWWWWW"], + "invalid": ["", "1", "$", "a1", "a211", "ass__aa", "-0we", "3e3", "kk$$22", "testingaverylongstring"] + } +} diff --git a/third_party/rust/icu_locid/benches/helpers/macros.rs b/third_party/rust/icu_locid/benches/helpers/macros.rs new file mode 100644 index 0000000000..848a360c41 --- /dev/null +++ b/third_party/rust/icu_locid/benches/helpers/macros.rs @@ -0,0 
+1,110 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[macro_export] +macro_rules! overview { + ($c:expr, $struct:ident, $data_str:expr, $compare:expr) => { + $c.bench_function("overview", |b| { + b.iter(|| { + let mut values = vec![]; + for s in $data_str { + let value: Result<$struct, _> = black_box(s).parse(); + values.push(value.expect("Parsing failed")); + } + let _ = values + .iter() + .filter(|&v| v.normalizing_eq($compare)) + .count(); + + values + .iter() + .map(|v| v.to_string()) + .collect::<Vec<String>>() + }) + }); + }; +} + +#[macro_export] +macro_rules! construct { + ($c:expr, $struct:ident, $struct_name:expr, $data_str:expr) => { + $c.bench_function($struct_name, |b| { + b.iter(|| { + for s in $data_str { + let _: Result<$struct, _> = black_box(s).parse(); + } + }) + }); + }; +} + +#[macro_export] +macro_rules! to_string { + ($c:expr, $struct:ident, $struct_name:expr, $data:expr) => { + $c.bench_function($struct_name, |b| { + b.iter(|| { + for s in $data { + let _ = black_box(s).to_string(); + } + }) + }); + $c.bench_function(std::concat!($struct_name, "/writeable"), |b| { + use writeable::Writeable; + b.iter(|| { + for s in $data { + let _ = black_box(s).write_to_string(); + } + }) + }); + }; +} + +#[macro_export] +macro_rules! compare_struct { + ($c:expr, $struct:ident, $struct_name:expr, $data1:expr, $data2:expr) => { + $c.bench_function(BenchmarkId::new("struct", $struct_name), |b| { + b.iter(|| { + for (lid1, lid2) in $data1.iter().zip($data2.iter()) { + let _ = black_box(lid1) == black_box(lid2); + } + }) + }); + }; +} + +#[macro_export] +macro_rules! 
compare_str { + ($c:expr, $struct:ident, $struct_name:expr, $data1:expr, $data2:expr) => { + $c.bench_function(BenchmarkId::new("str", $struct_name), |b| { + b.iter(|| { + for (lid, s) in $data1.iter().zip($data2.iter()) { + let _ = black_box(lid).normalizing_eq(&black_box(s)); + } + }) + }); + $c.bench_function(BenchmarkId::new("strict_cmp", $struct_name), |b| { + b.iter(|| { + for (lid, s) in $data1.iter().zip($data2.iter()) { + let _ = black_box(lid).strict_cmp(&black_box(s).as_str().as_bytes()); + } + }) + }); + }; +} + +#[macro_export] +macro_rules! canonicalize { + ($c:expr, $struct:ident, $struct_name:expr, $data:expr) => { + $c.bench_function($struct_name, |b| { + b.iter(|| { + for s in $data { + let _ = black_box(s).to_string(); + } + for s in $data { + let _ = $struct::canonicalize(black_box(s)); + } + }) + }); + }; +} diff --git a/third_party/rust/icu_locid/benches/helpers/mod.rs b/third_party/rust/icu_locid/benches/helpers/mod.rs new file mode 100644 index 0000000000..27e455f7be --- /dev/null +++ b/third_party/rust/icu_locid/benches/helpers/mod.rs @@ -0,0 +1,17 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod macros; + +use std::fs::File; +use std::io::{BufReader, Error}; + +pub fn read_fixture<T>(path: &str) -> Result<T, Error> +where + T: serde::de::DeserializeOwned, +{ + let file = File::open(path)?; + let reader = BufReader::new(file); + Ok(serde_json::from_reader(reader)?) +} diff --git a/third_party/rust/icu_locid/benches/iai_langid.rs b/third_party/rust/icu_locid/benches/iai_langid.rs new file mode 100644 index 0000000000..979da2f0f0 --- /dev/null +++ b/third_party/rust/icu_locid/benches/iai_langid.rs @@ -0,0 +1,124 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use icu_locid::{langid, subtags::language, subtags::region, LanguageIdentifier}; +use writeable::Writeable; + +const LIDS: &[LanguageIdentifier] = &[ + langid!("en"), + langid!("pl"), + langid!("fr-CA"), + langid!("zh-Hans"), + langid!("en-US"), + langid!("en-Latn-US"), + langid!("sr-Cyrl-BA"), +]; + +const LIDS_STR: &[&str] = &[ + "en", + "pl", + "fr-CA", + "zh-Hans", + "en-US", + "en-Latn-US", + "sr-Cyrl-BA", +]; + +fn bench_langid_constr() { + // Tests the instructions required to construct a LID from an str. + + let _: Vec<LanguageIdentifier> = LIDS_STR + .iter() + .map(|l| l.parse().expect("Failed to parse")) + .collect(); +} + +fn bench_langid_compare_components() { + // Tests the cost of comparing LID components. + + let result = LIDS + .iter() + .filter(|l| l.language == language!("en") && l.region == Some(region!("US"))) + .count(); + + assert_eq!(result, 2); +} + +fn bench_langid_compare_components_str() { + // Tests the cost of comparing LID components to str. + + let result = LIDS + .iter() + .filter(|l| { + l.language == language!("en") && l.region.map(|r| r == region!("US")).unwrap_or(false) + }) + .count(); + + assert_eq!(result, 2); +} + +fn bench_langid_strict_cmp() { + // Tests the cost of comparing a langid against byte strings. + use core::cmp::Ordering; + + let lid = langid!("en_us"); + + let result = LIDS_STR + .iter() + .filter(|s| lid.strict_cmp(s.as_bytes()) == Ordering::Equal) + .count(); + + assert_eq!(result, 1); +} + +fn bench_langid_matching() { + // Tests matching a LID against other LIDs. + + let lid = langid!("en_us"); + + let count = LIDS.iter().filter(|l| lid == **l).count(); + assert_eq!(count, 1); +} + +fn bench_langid_matching_str() { + // Tests matching a LID against list of str. 
+ + let lid = langid!("en_us"); + + let count = LIDS_STR.iter().filter(|&l| lid.normalizing_eq(l)).count(); + assert_eq!(count, 1); +} + +fn bench_langid_serialize() { + // Tests serialization of LIDs. + + let _: Vec<String> = LIDS.iter().map(|l| l.to_string()).collect(); +} + +fn bench_langid_serialize_writeable() { + // Tests serialization of LIDs. + + let _: Vec<_> = LIDS.iter().map(|l| l.write_to_string()).collect(); +} + +fn bench_langid_canonicalize() { + // Tests canonicalization of strings. + + let _: Vec<String> = LIDS_STR + .iter() + .map(|l| LanguageIdentifier::canonicalize(l).expect("Canonicalization failed")) + .collect(); +} + +iai::main!( + bench_langid_constr, + bench_langid_compare_components, + bench_langid_compare_components_str, + bench_langid_strict_cmp, + bench_langid_matching, + bench_langid_matching_str, + bench_langid_serialize, + bench_langid_serialize_writeable, + bench_langid_canonicalize, +); diff --git a/third_party/rust/icu_locid/benches/langid.rs b/third_party/rust/icu_locid/benches/langid.rs new file mode 100644 index 0000000000..e5c9b67340 --- /dev/null +++ b/third_party/rust/icu_locid/benches/langid.rs @@ -0,0 +1,93 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +mod fixtures; +mod helpers; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use icu_locid::LanguageIdentifier; + +fn langid_benches(c: &mut Criterion) { + let path = "./benches/fixtures/langid.json"; + let data: fixtures::LocaleList = helpers::read_fixture(path).expect("Failed to read a fixture"); + + // Overview + { + let mut group = c.benchmark_group("langid"); + + overview!(group, LanguageIdentifier, &data.canonicalized, "en-US"); + + group.finish(); + } + + #[cfg(feature = "bench")] + { + use criterion::BenchmarkId; + + // Construct + { + let mut group = c.benchmark_group("langid/construct"); + + construct!(group, LanguageIdentifier, "langid", &data.canonicalized); + + group.finish(); + } + + // Stringify + { + let mut group = c.benchmark_group("langid/to_string"); + + let langids: Vec<LanguageIdentifier> = data + .canonicalized + .iter() + .map(|s| s.parse().unwrap()) + .collect(); + + to_string!(group, LanguageIdentifier, "langid", &langids); + + group.finish(); + } + + // Compare + { + let mut group = c.benchmark_group("langid/compare"); + + let langids: Vec<LanguageIdentifier> = data + .canonicalized + .iter() + .map(|s| s.parse().unwrap()) + .collect(); + let langids2: Vec<LanguageIdentifier> = data + .canonicalized + .iter() + .map(|s| s.parse().unwrap()) + .collect(); + + compare_struct!(group, LanguageIdentifier, "langid", &langids, &langids2); + + compare_str!( + group, + LanguageIdentifier, + "langid", + &langids, + &data.canonicalized + ); + + group.finish(); + } + + // Canonicalize + { + let mut group = c.benchmark_group("langid/canonicalize"); + + canonicalize!(group, LanguageIdentifier, "langid", &data.casing); + + group.finish(); + } + } +} + +criterion_group!(benches, langid_benches,); +criterion_main!(benches); diff --git a/third_party/rust/icu_locid/benches/locale.rs b/third_party/rust/icu_locid/benches/locale.rs new file mode 100644 index 0000000000..948fbb5e8e --- /dev/null +++ 
b/third_party/rust/icu_locid/benches/locale.rs @@ -0,0 +1,87 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod fixtures; +mod helpers; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use icu_locid::Locale; + +fn locale_benches(c: &mut Criterion) { + let path = "./benches/fixtures/locale.json"; + let data: fixtures::LocaleList = helpers::read_fixture(path).expect("Failed to read a fixture"); + + // Overview + { + let mut group = c.benchmark_group("locale"); + + overview!(group, Locale, &data.canonicalized, "en-US"); + + group.finish(); + } + + #[cfg(feature = "bench")] + { + use criterion::BenchmarkId; + + // Construct + { + let mut group = c.benchmark_group("locale/construct"); + + construct!(group, Locale, "locale", &data.canonicalized); + + group.finish(); + } + + // Stringify + { + let mut group = c.benchmark_group("locale/to_string"); + + let locales: Vec<Locale> = data + .canonicalized + .iter() + .map(|s| s.parse().unwrap()) + .collect(); + + to_string!(group, Locale, "locale", &locales); + + group.finish(); + } + + // Compare + { + let mut group = c.benchmark_group("locale/compare"); + + let locales: Vec<Locale> = data + .canonicalized + .iter() + .map(|s| s.parse().unwrap()) + .collect(); + let locales2: Vec<Locale> = data + .canonicalized + .iter() + .map(|s| s.parse().unwrap()) + .collect(); + + compare_struct!(group, Locale, "locale", &locales, &locales2); + + compare_str!(group, Locale, "locale", &locales, &data.canonicalized); + + group.finish(); + } + + // Canonicalize + { + let mut group = c.benchmark_group("locale/canonicalize"); + + canonicalize!(group, Locale, "locale", &data.casing); + + group.finish(); + } + } +} + +criterion_group!(benches, locale_benches,); +criterion_main!(benches); diff --git a/third_party/rust/icu_locid/benches/subtags.rs 
b/third_party/rust/icu_locid/benches/subtags.rs new file mode 100644 index 0000000000..4f81b71d2e --- /dev/null +++ b/third_party/rust/icu_locid/benches/subtags.rs @@ -0,0 +1,39 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod fixtures; +mod helpers; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use icu_locid::subtags::{Language, Region, Script, Variant}; +use icu_locid::ParserError; + +macro_rules! subtag_bench { + ($c:expr, $name:expr, $subtag:ident, $data:expr) => { + $c.bench_function(&format!("subtags/{}/parse", $name), |b| { + b.iter(|| { + for s in &$data.valid { + let _: $subtag = black_box(s).parse().unwrap(); + } + for s in &$data.invalid { + let _: ParserError = black_box(s).parse::<$subtag>().unwrap_err(); + } + }) + }); + }; +} + +fn subtags_bench(c: &mut Criterion) { + let path = "./benches/fixtures/subtags.json"; + let data: fixtures::Subtags = helpers::read_fixture(path).expect("Failed to read a fixture"); + + subtag_bench!(c, "language", Language, data.language); + subtag_bench!(c, "script", Script, data.script); + subtag_bench!(c, "region", Region, data.region); + subtag_bench!(c, "variant", Variant, data.variant); +} + +criterion_group!(benches, subtags_bench,); +criterion_main!(benches); diff --git a/third_party/rust/icu_locid/examples/filter_langids.rs b/third_party/rust/icu_locid/examples/filter_langids.rs new file mode 100644 index 0000000000..67828a1181 --- /dev/null +++ b/third_party/rust/icu_locid/examples/filter_langids.rs @@ -0,0 +1,69 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +// A sample application which takes a comma separated list of language identifiers, +// filters out identifiers with language subtags different than `en` and serializes +// the list back into a comma separated list in canonical syntax. +// +// Note: This is an example of the API use, and is not a good base for language matching. +// For language matching, please consider algorithms such as Locale Matcher. + +#![no_main] // https://github.com/unicode-org/icu4x/issues/395 + +icu_benchmark_macros::static_setup!(); + +use std::env; + +use icu_locid::{subtags, LanguageIdentifier}; +use writeable::Writeable; + +const DEFAULT_INPUT: &str = + "de, en-us, zh-hant, sr-cyrl, fr-ca, es-cl, pl, en-latn-us, ca-valencia, und-arab"; + +fn filter_input(input: &str) -> String { + // 1. Parse the input string into a list of language identifiers. + let langids = input.split(',').filter_map(|s| s.trim().parse().ok()); + + // 2. Filter for LanguageIdentifiers with Language subtag `en`. + let en_lang: subtags::Language = "en".parse().expect("Failed to parse language subtag."); + + let en_langids = langids.filter(|langid: &LanguageIdentifier| langid.language == en_lang); + + // 3. Serialize the output. 
+ let en_strs: Vec<String> = en_langids + .map(|langid| langid.write_to_string().into_owned()) + .collect(); + + en_strs.join(", ") +} + +#[no_mangle] +fn main(_argc: isize, _argv: *const *const u8) -> isize { + icu_benchmark_macros::main_setup!(); + let args: Vec<String> = env::args().collect(); + + let input = if let Some(input) = args.get(1) { + input.as_str() + } else { + DEFAULT_INPUT + }; + let _output = filter_input(input); + + #[cfg(debug_assertions)] + println!("\nInput: {input}\nOutput: {_output}"); + + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + + const DEFAULT_OUTPUT: &str = "en-US, en-Latn-US"; + + #[test] + fn ensure_default_output() { + assert_eq!(filter_input(DEFAULT_INPUT), DEFAULT_OUTPUT); + } +} diff --git a/third_party/rust/icu_locid/examples/syntatically_canonicalize_locales.rs b/third_party/rust/icu_locid/examples/syntatically_canonicalize_locales.rs new file mode 100644 index 0000000000..1f967504eb --- /dev/null +++ b/third_party/rust/icu_locid/examples/syntatically_canonicalize_locales.rs @@ -0,0 +1,54 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// A sample application which takes a comma separated list of locales, +// makes them syntatically canonical and serializes the list back into a comma separated list. + +icu_benchmark_macros::static_setup!(); + +use std::env; + +use icu_locid::Locale; + +const DEFAULT_INPUT: &str = "sr-cyrL-rS, es-mx, und-arab-u-ca-Buddhist"; + +fn syntatically_canonicalize_locales(input: &str) -> String { + // Split input string and canonicalize each locale identifier. 
+ let canonical_locales: Vec<String> = input + .split(',') + .filter_map(|s| Locale::canonicalize(s.trim()).ok()) + .collect(); + + canonical_locales.join(", ") +} + +fn main() { + icu_benchmark_macros::main_setup!(); + let args: Vec<String> = env::args().collect(); + + let input = if let Some(input) = args.get(1) { + input.as_str() + } else { + DEFAULT_INPUT + }; + let _output = syntatically_canonicalize_locales(input); + + #[cfg(debug_assertions)] + println!("\nInput: {input}\nOutput: {_output}"); +} + +#[cfg(test)] +mod tests { + use super::*; + + const DEFAULT_OUTPUT: &str = "sr-Cyrl-RS, es-MX, und-Arab-u-ca-buddhist"; + + #[test] + fn ensure_default_output() { + assert_eq!( + syntatically_canonicalize_locales(DEFAULT_INPUT), + DEFAULT_OUTPUT + ); + } +} diff --git a/third_party/rust/icu_locid/src/databake.rs b/third_party/rust/icu_locid/src/databake.rs new file mode 100644 index 0000000000..03b7357c5e --- /dev/null +++ b/third_party/rust/icu_locid/src/databake.rs @@ -0,0 +1,23 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::LanguageIdentifier; +use alloc::string::ToString; +use databake::*; + +impl Bake for LanguageIdentifier { + fn bake(&self, env: &CrateEnv) -> TokenStream { + env.insert("icu_locid"); + let repr = self.to_string(); + if self.variants.len() <= 1 { + quote! { + icu_locid::langid!(#repr) + } + } else { + quote! { + icu_locid::LanguageIdentifier::from_str(#repr).unwrap() + } + } + } +} diff --git a/third_party/rust/icu_locid/src/extensions/mod.rs b/third_party/rust/icu_locid/src/extensions/mod.rs new file mode 100644 index 0000000000..a37bf8b9fc --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/mod.rs @@ -0,0 +1,324 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with +//! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`] +//! is called [`Locale`]. +//! +//! There are four types of extensions: +//! +//! * [`Unicode Extensions`] - marked as `u`. +//! * [`Transform Extensions`] - marked as `t`. +//! * [`Private Use Extensions`] - marked as `x`. +//! * [`Other Extensions`] - marked as any `a-z` except of `u`, `t` and `x`. +//! +//! One can think of extensions as a bag of extra information on top of basic 4 [`subtags`]. +//! +//! Notice: `Other` extension type is currently not supported. +//! +//! # Examples +//! +//! ``` +//! use icu::locid::extensions::unicode::{Key, Value}; +//! use icu::locid::Locale; +//! +//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo" +//! .parse() +//! .expect("Failed to parse."); +//! +//! assert_eq!(loc.id.language, "en".parse().unwrap()); +//! assert_eq!(loc.id.script, None); +//! assert_eq!(loc.id.region, Some("US".parse().unwrap())); +//! assert_eq!(loc.id.variants.len(), 0); +//! +//! let key: Key = "ca".parse().expect("Parsing key failed."); +//! let value: Value = "buddhist".parse().expect("Parsing value failed."); +//! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value)); +//! ``` +//! +//! [`LanguageIdentifier`]: super::LanguageIdentifier +//! [`Locale`]: super::Locale +//! [`subtags`]: super::subtags +//! [`Other Extensions`]: other +//! [`Private Use Extensions`]: private +//! [`Transform Extensions`]: transform +//! 
[`Unicode Extensions`]: unicode +pub mod other; +pub mod private; +pub mod transform; +pub mod unicode; + +use other::Other; +use private::Private; +use transform::Transform; +use unicode::Unicode; + +use alloc::vec::Vec; + +use crate::parser::ParserError; +use crate::parser::SubtagIterator; + +/// Defines the type of extension. +#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)] +#[non_exhaustive] +pub enum ExtensionType { + /// Transform Extension Type marked as `t`. + Transform, + /// Unicode Extension Type marked as `u`. + Unicode, + /// Private Extension Type marked as `x`. + Private, + /// All other extension types. + Other(u8), +} + +impl ExtensionType { + pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParserError> { + let key = key.to_ascii_lowercase(); + match key { + b'u' => Ok(Self::Unicode), + b't' => Ok(Self::Transform), + b'x' => Ok(Self::Private), + b'a'..=b'z' => Ok(Self::Other(key)), + _ => Err(ParserError::InvalidExtension), + } + } + + pub(crate) const fn try_from_bytes_manual_slice( + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Self, ParserError> { + if end - start != 1 { + return Err(ParserError::InvalidExtension); + } + #[allow(clippy::indexing_slicing)] + Self::try_from_byte(bytes[start]) + } +} + +/// A map of extensions associated with a given [`Locale`](crate::Locale). +#[derive(Debug, Default, PartialEq, Eq, Clone, Hash)] +#[non_exhaustive] +pub struct Extensions { + /// A representation of the data for a Unicode extension, when present in the locale identifier. + pub unicode: Unicode, + /// A representation of the data for a transform extension, when present in the locale identifier. + pub transform: Transform, + /// A representation of the data for a private-use extension, when present in the locale identifier. 
+ pub private: Private, + /// A sequence of any other extensions that are present in the locale identifier but are not formally + /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`], + /// and [`Private`] are. + pub other: Vec<Other>, +} + +impl Extensions { + /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::Extensions; + /// + /// assert_eq!(Extensions::new(), Extensions::default()); + /// ``` + #[inline] + pub const fn new() -> Self { + Self { + unicode: Unicode::new(), + transform: Transform::new(), + private: Private::new(), + other: Vec::new(), + } + } + + /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const` + /// context. + #[inline] + pub const fn from_unicode(unicode: Unicode) -> Self { + Self { + unicode, + transform: Transform::new(), + private: Private::new(), + other: Vec::new(), + } + } + + /// Returns whether there are no extensions present. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed."); + /// + /// assert!(!loc.extensions.is_empty()); + /// ``` + pub fn is_empty(&self) -> bool { + self.unicode.is_empty() + && self.transform.is_empty() + && self.private.is_empty() + && self.other.is_empty() + } + + /// Retains the specified extension types, clearing all others. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::ExtensionType; + /// use icu::locid::Locale; + /// + /// let loc: Locale = + /// "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap(); + /// + /// let mut only_unicode = loc.clone(); + /// only_unicode + /// .extensions + /// .retain_by_type(|t| t == ExtensionType::Unicode); + /// assert_eq!(only_unicode, "und-u-world".parse().unwrap()); + /// + /// let mut only_t_z = loc.clone(); + /// only_t_z.extensions.retain_by_type(|t| { + /// t == ExtensionType::Transform || t == ExtensionType::Other(b'z') + /// }); + /// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap()); + /// ``` + pub fn retain_by_type<F>(&mut self, mut predicate: F) + where + F: FnMut(ExtensionType) -> bool, + { + if !predicate(ExtensionType::Unicode) { + self.unicode.clear(); + } + if !predicate(ExtensionType::Transform) { + self.transform.clear(); + } + if !predicate(ExtensionType::Private) { + self.private.clear(); + } + self.other + .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte()))); + } + + pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> { + let mut unicode = None; + let mut transform = None; + let mut private = None; + let mut other = Vec::new(); + + while let Some(subtag) = iter.next() { + if subtag.is_empty() { + return Err(ParserError::InvalidExtension); + } + match subtag.first().map(|b| ExtensionType::try_from_byte(*b)) { + Some(Ok(ExtensionType::Unicode)) => { + if unicode.is_some() { + return Err(ParserError::DuplicatedExtension); + } + unicode = Some(Unicode::try_from_iter(iter)?); + } + Some(Ok(ExtensionType::Transform)) => { + if transform.is_some() { + return Err(ParserError::DuplicatedExtension); + } + transform = Some(Transform::try_from_iter(iter)?); + } + Some(Ok(ExtensionType::Private)) => { + if private.is_some() { + return Err(ParserError::DuplicatedExtension); + } + private = Some(Private::try_from_iter(iter)?); + } + 
Some(Ok(ExtensionType::Other(ext))) => { + if other.iter().any(|o: &Other| o.get_ext_byte() == ext) { + return Err(ParserError::DuplicatedExtension); + } + let parsed = Other::try_from_iter(ext, iter)?; + if let Err(idx) = other.binary_search(&parsed) { + other.insert(idx, parsed); + } else { + return Err(ParserError::InvalidExtension); + } + } + _ => return Err(ParserError::InvalidExtension), + } + } + + Ok(Self { + unicode: unicode.unwrap_or_default(), + transform: transform.unwrap_or_default(), + private: private.unwrap_or_default(), + other, + }) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + let mut wrote_tu = false; + // Alphabetic by singleton + self.other.iter().try_for_each(|other| { + if other.get_ext() > 't' && !wrote_tu { + // Since 't' and 'u' are next to each other in alphabetical + // order, write both now. + self.transform.for_each_subtag_str(f)?; + self.unicode.for_each_subtag_str(f)?; + wrote_tu = true; + } + other.for_each_subtag_str(f)?; + Ok(()) + })?; + + if !wrote_tu { + self.transform.for_each_subtag_str(f)?; + self.unicode.for_each_subtag_str(f)?; + } + + // Private must be written last, since it allows single character + // keys. Extensions must also be written in alphabetical order, + // which would seem to imply that other extensions `y` and `z` are + // invalid, but this is not specified. 
+ self.private.for_each_subtag_str(f)?; + Ok(()) + } +} + +impl_writeable_for_each_subtag_str_no_test!(Extensions); + +#[test] +fn test_writeable() { + use crate::Locale; + use writeable::assert_writeable_eq; + assert_writeable_eq!(Extensions::new(), ""); + assert_writeable_eq!( + "my-t-my-d0-zawgyi".parse::<Locale>().unwrap().extensions, + "t-my-d0-zawgyi", + ); + assert_writeable_eq!( + "ar-SA-u-ca-islamic-civil" + .parse::<Locale>() + .unwrap() + .extensions, + "u-ca-islamic-civil", + ); + assert_writeable_eq!( + "en-001-x-foo-bar".parse::<Locale>().unwrap().extensions, + "x-foo-bar", + ); + assert_writeable_eq!( + "und-t-m0-true".parse::<Locale>().unwrap().extensions, + "t-m0-true", + ); + assert_writeable_eq!( + "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo" + .parse::<Locale>() + .unwrap() + .extensions, + "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo", + ); +} diff --git a/third_party/rust/icu_locid/src/extensions/other/mod.rs b/third_party/rust/icu_locid/src/extensions/other/mod.rs new file mode 100644 index 0000000000..933128739d --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/other/mod.rs @@ -0,0 +1,186 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Other Use Extensions is a list of extensions other than unicode, +//! transform or private. +//! +//! Those extensions are treated as a pass-through, and no Unicode related +//! behavior depends on them. +//! +//! The main struct for this extension is [`Other`] which is a list of [`Subtag`]s. +//! +//! # Examples +//! +//! ``` +//! use icu::locid::extensions::other::Other; +//! use icu::locid::Locale; +//! +//! let mut loc: Locale = "en-US-a-foo-faa".parse().expect("Parsing failed."); +//! 
``` + +mod subtag; + +use crate::helpers::ShortSlice; +use crate::parser::ParserError; +use crate::parser::SubtagIterator; +use alloc::vec::Vec; +#[doc(inline)] +pub use subtag::{subtag, Subtag}; + +/// A list of [`Other Use Extensions`] as defined in [`Unicode Locale +/// Identifier`] specification. +/// +/// Those extensions are treated as a pass-through, and no Unicode related +/// behavior depends on them. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::other::{Other, Subtag}; +/// +/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag."); +/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag."); +/// +/// let other = Other::from_vec_unchecked(b'a', vec![subtag1, subtag2]); +/// assert_eq!(&other.to_string(), "a-foo-bar"); +/// ``` +/// +/// [`Other Use Extensions`]: https://unicode.org/reports/tr35/#other_extensions +/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier +#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)] +pub struct Other { + ext: u8, + keys: ShortSlice<Subtag>, +} + +impl Other { + /// A constructor which takes a pre-sorted list of [`Subtag`]. + /// + /// # Panics + /// + /// Panics if `ext` is not ASCII alphabetic. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::other::{Other, Subtag}; + /// + /// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag."); + /// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag."); + /// + /// let other = Other::from_vec_unchecked(b'a', vec![subtag1, subtag2]); + /// assert_eq!(&other.to_string(), "a-foo-bar"); + /// ``` + pub fn from_vec_unchecked(ext: u8, keys: Vec<Subtag>) -> Self { + Self::from_short_slice_unchecked(ext, keys.into()) + } + + pub(crate) fn from_short_slice_unchecked(ext: u8, keys: ShortSlice<Subtag>) -> Self { + assert!(ext.is_ascii_alphabetic()); + Self { ext, keys } + } + + pub(crate) fn try_from_iter(ext: u8, iter: &mut SubtagIterator) -> Result<Self, ParserError> { + debug_assert!(ext.is_ascii_alphabetic()); + + let mut keys = ShortSlice::new(); + while let Some(subtag) = iter.peek() { + if !Subtag::valid_key(subtag) { + break; + } + if let Ok(key) = Subtag::try_from_bytes(subtag) { + keys.push(key); + } + iter.next(); + } + + Ok(Self::from_short_slice_unchecked(ext, keys)) + } + + /// Gets the tag character for this extension as a &str. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let loc: Locale = "und-a-hello-world".parse().unwrap(); + /// let other_ext = &loc.extensions.other[0]; + /// assert_eq!(other_ext.get_ext_str(), "a"); + /// ``` + pub fn get_ext_str(&self) -> &str { + debug_assert!(self.ext.is_ascii_alphabetic()); + unsafe { core::str::from_utf8_unchecked(core::slice::from_ref(&self.ext)) } + } + + /// Gets the tag character for this extension as a char. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let loc: Locale = "und-a-hello-world".parse().unwrap(); + /// let other_ext = &loc.extensions.other[0]; + /// assert_eq!(other_ext.get_ext(), 'a'); + /// ``` + pub fn get_ext(&self) -> char { + self.ext as char + } + + /// Gets the tag character for this extension as a byte. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let loc: Locale = "und-a-hello-world".parse().unwrap(); + /// let other_ext = &loc.extensions.other[0]; + /// assert_eq!(other_ext.get_ext_byte(), b'a'); + /// ``` + pub fn get_ext_byte(&self) -> u8 { + self.ext + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + f(self.get_ext_str())?; + self.keys.iter().map(|t| t.as_str()).try_for_each(f) + } +} + +writeable::impl_display_with_writeable!(Other); + +impl writeable::Writeable for Other { + fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result { + sink.write_str(self.get_ext_str())?; + for key in self.keys.iter() { + sink.write_char('-')?; + writeable::Writeable::write_to(key, sink)?; + } + + Ok(()) + } + + fn writeable_length_hint(&self) -> writeable::LengthHint { + let mut result = writeable::LengthHint::exact(1); + for key in self.keys.iter() { + result += writeable::Writeable::writeable_length_hint(key) + 1; + } + result + } + + fn write_to_string(&self) -> alloc::borrow::Cow<str> { + if self.keys.is_empty() { + return alloc::borrow::Cow::Borrowed(self.get_ext_str()); + } + let mut string = + alloc::string::String::with_capacity(self.writeable_length_hint().capacity()); + let _ = self.write_to(&mut string); + alloc::borrow::Cow::Owned(string) + } +} diff --git a/third_party/rust/icu_locid/src/extensions/other/subtag.rs b/third_party/rust/icu_locid/src/extensions/other/subtag.rs new file mode 100644 index 0000000000..03be569406 --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/other/subtag.rs @@ -0,0 +1,36 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +impl_tinystr_subtag!( + /// A single item used in a list of [`Other`](super::Other) extensions. 
+ /// + /// The subtag has to be an ASCII alphanumerical string no shorter than + /// two characters and no longer than eight. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::other::subtag; + /// + /// assert_eq!(subtag!("Foo").as_str(), "foo"); + /// ``` + Subtag, + extensions::other, + subtag, + extensions_other_subtag, + 2..=8, + s, + s.is_ascii_alphanumeric(), + s.to_ascii_lowercase(), + s.is_ascii_alphanumeric() && s.is_ascii_lowercase(), + InvalidExtension, + ["foo12"], + ["y", "toolooong"], +); + +impl Subtag { + pub(crate) const fn valid_key(v: &[u8]) -> bool { + 2 <= v.len() && v.len() <= 8 + } +} diff --git a/third_party/rust/icu_locid/src/extensions/private/mod.rs b/third_party/rust/icu_locid/src/extensions/private/mod.rs new file mode 100644 index 0000000000..5b41fdce09 --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/private/mod.rs @@ -0,0 +1,187 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Private Use Extensions is a list of extensions intended for +//! private use. +//! +//! Those extensions are treated as a pass-through, and no Unicode related +//! behavior depends on them. +//! +//! The main struct for this extension is [`Private`] which is a list of [`Subtag`]s. +//! +//! # Examples +//! +//! ``` +//! use icu::locid::extensions::private::subtag; +//! use icu::locid::{locale, Locale}; +//! +//! let mut loc: Locale = "en-US-x-foo-faa".parse().expect("Parsing failed."); +//! +//! assert!(loc.extensions.private.contains(&subtag!("foo"))); +//! assert_eq!(loc.extensions.private.iter().next(), Some(&subtag!("foo"))); +//! +//! loc.extensions.private.clear(); +//! +//! assert!(loc.extensions.private.is_empty()); +//! assert_eq!(loc, locale!("en-US")); +//! 
``` + +mod other; + +use alloc::vec::Vec; +use core::ops::Deref; + +#[doc(inline)] +pub use other::{subtag, Subtag}; + +use crate::helpers::ShortSlice; +use crate::parser::ParserError; +use crate::parser::SubtagIterator; + +/// A list of [`Private Use Extensions`] as defined in [`Unicode Locale +/// Identifier`] specification. +/// +/// Those extensions are treated as a pass-through, and no Unicode related +/// behavior depends on them. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::private::{Private, Subtag}; +/// +/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag."); +/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag."); +/// +/// let private = Private::from_vec_unchecked(vec![subtag1, subtag2]); +/// assert_eq!(&private.to_string(), "x-foo-bar"); +/// ``` +/// +/// [`Private Use Extensions`]: https://unicode.org/reports/tr35/#pu_extensions +/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier +#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)] +pub struct Private(ShortSlice<Subtag>); + +impl Private { + /// Returns a new empty list of private-use extensions. Same as [`default()`](Default::default()), but is `const`. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::private::Private; + /// + /// assert_eq!(Private::new(), Private::default()); + /// ``` + #[inline] + pub const fn new() -> Self { + Self(ShortSlice::new()) + } + + /// A constructor which takes a pre-sorted list of [`Subtag`]. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::private::{Private, Subtag}; + /// + /// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag."); + /// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag."); + /// + /// let private = Private::from_vec_unchecked(vec![subtag1, subtag2]); + /// assert_eq!(&private.to_string(), "x-foo-bar"); + /// ``` + pub fn from_vec_unchecked(input: Vec<Subtag>) -> Self { + Self(input.into()) + } + + /// A constructor which takes a single [`Subtag`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::private::{Private, Subtag}; + /// + /// let subtag: Subtag = "foo".parse().expect("Failed to parse a Subtag."); + /// + /// let private = Private::new_single(subtag); + /// assert_eq!(&private.to_string(), "x-foo"); + /// ``` + pub const fn new_single(input: Subtag) -> Self { + Self(ShortSlice::new_single(input)) + } + + /// Empties the [`Private`] list. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::private::{Private, Subtag}; + /// + /// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag."); + /// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag."); + /// let mut private = Private::from_vec_unchecked(vec![subtag1, subtag2]); + /// + /// assert_eq!(&private.to_string(), "x-foo-bar"); + /// + /// private.clear(); + /// + /// assert_eq!(private, Private::new()); + /// ``` + pub fn clear(&mut self) { + self.0.clear(); + } + + pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> { + let keys = iter + .map(Subtag::try_from_bytes) + .collect::<Result<ShortSlice<_>, _>>()?; + + Ok(Self(keys)) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + if self.is_empty() { + return Ok(()); + } + f("x")?; + self.deref().iter().map(|t| t.as_str()).try_for_each(f) + } +} + 
+writeable::impl_display_with_writeable!(Private); + +impl writeable::Writeable for Private { + fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result { + if self.is_empty() { + return Ok(()); + } + sink.write_str("x")?; + for key in self.iter() { + sink.write_char('-')?; + writeable::Writeable::write_to(key, sink)?; + } + Ok(()) + } + + fn writeable_length_hint(&self) -> writeable::LengthHint { + if self.is_empty() { + return writeable::LengthHint::exact(0); + } + let mut result = writeable::LengthHint::exact(1); + for key in self.iter() { + result += writeable::Writeable::writeable_length_hint(key) + 1; + } + result + } +} + +impl Deref for Private { + type Target = [Subtag]; + + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} diff --git a/third_party/rust/icu_locid/src/extensions/private/other.rs b/third_party/rust/icu_locid/src/extensions/private/other.rs new file mode 100644 index 0000000000..810ffa2f49 --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/private/other.rs @@ -0,0 +1,32 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +impl_tinystr_subtag!( + /// A single item used in a list of [`Private`](super::Private) extensions. + /// + /// The subtag has to be an ASCII alphanumerical string no shorter than + /// one character and no longer than eight. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::private::Subtag; + /// + /// let subtag1: Subtag = "Foo".parse().expect("Failed to parse a Subtag."); + /// + /// assert_eq!(subtag1.as_str(), "foo"); + /// ``` + Subtag, + extensions::private, + subtag, + extensions_private_subtag, + 1..=8, + s, + s.is_ascii_alphanumeric(), + s.to_ascii_lowercase(), + s.is_ascii_alphanumeric() && s.is_ascii_lowercase(), + InvalidExtension, + ["foo12"], + ["toolooong"], +); diff --git a/third_party/rust/icu_locid/src/extensions/transform/fields.rs b/third_party/rust/icu_locid/src/extensions/transform/fields.rs new file mode 100644 index 0000000000..2f12de9d14 --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/transform/fields.rs @@ -0,0 +1,221 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::borrow::Borrow; +use core::iter::FromIterator; +use litemap::LiteMap; + +use super::Key; +use super::Value; + +/// A list of [`Key`]-[`Value`] pairs representing functional information +/// about content transformations. +/// +/// Here are examples of fields used in Unicode: +/// - `s0`, `d0` - Transform source/destination +/// - `t0` - Machine Translation +/// - `h0` - Hybrid Locale Identifiers +/// +/// You can find the full list in [`Unicode BCP 47 T Extension`] section of LDML. 
+/// +/// [`Unicode BCP 47 T Extension`]: https://unicode.org/reports/tr35/tr35.html#BCP47_T_Extension +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::transform::{key, Fields, Key, Value}; +/// +/// let value = "hybrid".parse::<Value>().expect("Failed to parse a Value."); +/// let fields = [(key!("h0"), value)].into_iter().collect::<Fields>(); +/// +/// assert_eq!(&fields.to_string(), "h0-hybrid"); +/// ``` +#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)] +pub struct Fields(LiteMap<Key, Value>); + +impl Fields { + /// Returns a new empty list of key-value pairs. Same as [`default()`](Default::default()), but is `const`. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::Fields; + /// + /// assert_eq!(Fields::new(), Fields::default()); + /// ``` + #[inline] + pub const fn new() -> Self { + Self(LiteMap::new()) + } + + /// Returns `true` if there are no fields. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::Fields; + /// use icu::locid::locale; + /// use icu::locid::Locale; + /// + /// let loc1 = Locale::try_from_bytes(b"und-t-h0-hybrid").unwrap(); + /// let loc2 = locale!("und-u-ca-buddhist"); + /// + /// assert!(!loc1.extensions.transform.fields.is_empty()); + /// assert!(loc2.extensions.transform.fields.is_empty()); + /// ``` + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Empties the [`Fields`] list. + /// + /// Returns the old list. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::{key, Fields, Value}; + /// + /// let value = "hybrid".parse::<Value>().expect("Failed to parse a Value."); + /// let mut fields = [(key!("h0"), value)].into_iter().collect::<Fields>(); + /// + /// assert_eq!(&fields.to_string(), "h0-hybrid"); + /// + /// fields.clear(); + /// + /// assert_eq!(fields, Fields::new()); + /// ``` + pub fn clear(&mut self) -> Self { + core::mem::take(self) + } + + /// Returns `true` if the list contains a [`Value`] for the specified [`Key`]. + /// + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::{Fields, Key, Value}; + /// + /// let key: Key = "h0".parse().expect("Failed to parse a Key."); + /// let value: Value = "hybrid".parse().expect("Failed to parse a Value."); + /// let mut fields = [(key, value)].into_iter().collect::<Fields>(); + /// + /// let key: Key = "h0".parse().expect("Failed to parse a Key."); + /// assert!(&fields.contains_key(&key)); + /// ``` + pub fn contains_key<Q>(&self, key: &Q) -> bool + where + Key: Borrow<Q>, + Q: Ord, + { + self.0.contains_key(key) + } + + /// Returns a reference to the [`Value`] corresponding to the [`Key`]. + /// + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::{key, Fields, Key, Value}; + /// + /// let value = "hybrid".parse::<Value>().unwrap(); + /// let fields = [(key!("h0"), value.clone())] + /// .into_iter() + /// .collect::<Fields>(); + /// + /// assert_eq!(fields.get(&key!("h0")), Some(&value)); + /// ``` + pub fn get<Q>(&self, key: &Q) -> Option<&Value> + where + Key: Borrow<Q>, + Q: Ord, + { + self.0.get(key) + } + + /// Sets the specified keyword, returning the old value if it already existed. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::{key, Key, Value}; + /// use icu::locid::Locale; + /// + /// let lower = "lower".parse::<Value>().expect("valid extension subtag"); + /// let casefold = "casefold".parse::<Value>().expect("valid extension subtag"); + /// + /// let mut loc: Locale = "en-t-hi-d0-casefold" + /// .parse() + /// .expect("valid BCP-47 identifier"); + /// let old_value = loc.extensions.transform.fields.set(key!("d0"), lower); + /// + /// assert_eq!(old_value, Some(casefold)); + /// assert_eq!(loc, "en-t-hi-d0-lower".parse().unwrap()); + /// ``` + pub fn set(&mut self, key: Key, value: Value) -> Option<Value> { + self.0.insert(key, value) + } + + /// Retains a subset of fields as specified by the predicate function. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::key; + /// use icu::locid::Locale; + /// + /// let mut loc: Locale = "und-t-h0-hybrid-d0-hex-m0-xml".parse().unwrap(); + /// + /// loc.extensions + /// .transform + /// .fields + /// .retain_by_key(|&k| k == key!("h0")); + /// assert_eq!(loc, "und-t-h0-hybrid".parse().unwrap()); + /// + /// loc.extensions + /// .transform + /// .fields + /// .retain_by_key(|&k| k == key!("d0")); + /// assert_eq!(loc, Locale::UND); + /// ``` + pub fn retain_by_key<F>(&mut self, mut predicate: F) + where + F: FnMut(&Key) -> bool, + { + self.0.retain(|k, _| predicate(k)) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + for (k, v) in self.0.iter() { + f(k.as_str())?; + v.for_each_subtag_str(f)?; + } + Ok(()) + } + + /// This needs to be its own method to help with type inference in helpers.rs + #[cfg(test)] + pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self { + v.into_iter().collect() + } +} + +impl From<LiteMap<Key, Value>> for Fields { + fn from(map: LiteMap<Key, Value>) -> Self { + Self(map) + } +} + +impl FromIterator<(Key, Value)> for 
Fields { + fn from_iter<I: IntoIterator<Item = (Key, Value)>>(iter: I) -> Self { + LiteMap::from_iter(iter).into() + } +} + +impl_writeable_for_key_value!(Fields, "h0", "hybrid", "m0", "m0-true"); diff --git a/third_party/rust/icu_locid/src/extensions/transform/key.rs b/third_party/rust/icu_locid/src/extensions/transform/key.rs new file mode 100644 index 0000000000..afdb31d760 --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/transform/key.rs @@ -0,0 +1,32 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +impl_tinystr_subtag!( + /// A key used in a list of [`Fields`](super::Fields). + /// + /// The key has to be a two ASCII characters long, with the first + /// character being alphabetic, and the second being a number. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::Key; + /// + /// let key1: Key = "k0".parse().expect("Failed to parse a Key."); + /// + /// assert_eq!(key1.as_str(), "k0"); + /// ``` + Key, + extensions::transform, + key, + extensions_transform_key, + 2..=2, + s, + s.all_bytes()[0].is_ascii_alphabetic() && s.all_bytes()[1].is_ascii_digit(), + s.to_ascii_lowercase(), + s.all_bytes()[0].is_ascii_lowercase() && s.all_bytes()[1].is_ascii_digit(), + InvalidExtension, + ["k0"], + ["", "k", "0k", "k12"], +); diff --git a/third_party/rust/icu_locid/src/extensions/transform/mod.rs b/third_party/rust/icu_locid/src/extensions/transform/mod.rs new file mode 100644 index 0000000000..f5bb74e0db --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/transform/mod.rs @@ -0,0 +1,237 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 
Transform Extensions provide information on content transformations in a given locale. +//! +//! The main struct for this extension is [`Transform`] which contains [`Fields`] and an +//! optional [`LanguageIdentifier`]. +//! +//! [`LanguageIdentifier`]: super::super::LanguageIdentifier +//! +//! # Examples +//! +//! ``` +//! use icu::locid::extensions::transform::{Fields, Key, Transform, Value}; +//! use icu::locid::{LanguageIdentifier, Locale}; +//! +//! let mut loc: Locale = +//! "en-US-t-es-ar-h0-hybrid".parse().expect("Parsing failed."); +//! +//! let lang: LanguageIdentifier = +//! "es-AR".parse().expect("Parsing LanguageIdentifier failed."); +//! +//! let key: Key = "h0".parse().expect("Parsing key failed."); +//! let value: Value = "hybrid".parse().expect("Parsing value failed."); +//! +//! assert_eq!(loc.extensions.transform.lang, Some(lang)); +//! assert!(loc.extensions.transform.fields.contains_key(&key)); +//! assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value)); +//! +//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-ar-h0-hybrid"); +//! ``` +mod fields; +mod key; +mod value; + +pub use fields::Fields; +#[doc(inline)] +pub use key::{key, Key}; +pub use value::Value; + +use crate::helpers::ShortSlice; +use crate::parser::SubtagIterator; +use crate::parser::{parse_language_identifier_from_iter, ParserError, ParserMode}; +use crate::subtags::Language; +use crate::LanguageIdentifier; +use litemap::LiteMap; + +/// A list of [`Unicode BCP47 T Extensions`] as defined in [`Unicode Locale +/// Identifier`] specification. +/// +/// Transform extension carries information about source language or script of +/// transformed content, including content that has been transliterated, transcribed, +/// or translated, or in some other way influenced by the source (See [`RFC 6497`] for details). 
+/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::transform::{Key, Value}; +/// use icu::locid::{LanguageIdentifier, Locale}; +/// +/// let mut loc: Locale = +/// "de-t-en-us-h0-hybrid".parse().expect("Parsing failed."); +/// +/// let en_us: LanguageIdentifier = "en-US".parse().expect("Parsing failed."); +/// +/// assert_eq!(loc.extensions.transform.lang, Some(en_us)); +/// let key: Key = "h0".parse().expect("Parsing key failed."); +/// let value: Value = "hybrid".parse().expect("Parsing value failed."); +/// assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value)); +/// ``` +/// [`Unicode BCP47 T Extensions`]: https://unicode.org/reports/tr35/#t_Extension +/// [`RFC 6497`]: https://www.ietf.org/rfc/rfc6497.txt +/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier +#[derive(Clone, PartialEq, Eq, Debug, Default, Hash)] +#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure +pub struct Transform { + /// The [`LanguageIdentifier`] specified with this locale extension, or `None` if not present. + pub lang: Option<LanguageIdentifier>, + /// The key-value pairs present in this locale extension, with each extension key subtag + /// associated to its provided value subtag. + pub fields: Fields, +} + +impl Transform { + /// Returns a new empty map of Transform extensions. Same as [`default()`](Default::default()), but is `const`. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::Transform; + /// + /// assert_eq!(Transform::new(), Transform::default()); + /// ``` + #[inline] + pub const fn new() -> Self { + Self { + lang: None, + fields: Fields::new(), + } + } + + /// Returns `true` if there are no tfields and no tlang in the `TransformExtensionList`. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let mut loc: Locale = "en-US-t-es-ar".parse().expect("Parsing failed."); + /// + /// assert!(!loc.extensions.transform.is_empty()); + /// ``` + pub fn is_empty(&self) -> bool { + self.lang.is_none() && self.fields.is_empty() + } + + /// Clears the transform extension, effectively removing it from the locale. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let mut loc: Locale = "en-US-t-es-ar".parse().unwrap(); + /// loc.extensions.transform.clear(); + /// assert_eq!(loc, "en-US".parse().unwrap()); + /// ``` + pub fn clear(&mut self) { + self.lang = None; + self.fields.clear(); + } + + pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> { + let mut tlang = None; + let mut tfields = LiteMap::new(); + + if let Some(subtag) = iter.peek() { + if Language::try_from_bytes(subtag).is_ok() { + tlang = Some(parse_language_identifier_from_iter( + iter, + ParserMode::Partial, + )?); + } + } + + let mut current_tkey = None; + let mut current_tvalue = ShortSlice::new(); + let mut has_current_tvalue = false; + + while let Some(subtag) = iter.peek() { + if let Some(tkey) = current_tkey { + if let Ok(val) = Value::parse_subtag(subtag) { + has_current_tvalue = true; + if let Some(val) = val { + current_tvalue.push(val); + } + } else { + if !has_current_tvalue { + return Err(ParserError::InvalidExtension); + } + tfields.try_insert(tkey, Value::from_short_slice_unchecked(current_tvalue)); + current_tkey = None; + current_tvalue = ShortSlice::new(); + has_current_tvalue = false; + continue; + } + } else if let Ok(tkey) = Key::try_from_bytes(subtag) { + current_tkey = Some(tkey); + } else { + break; + } + + iter.next(); + } + + if let Some(tkey) = current_tkey { + if !has_current_tvalue { + return Err(ParserError::InvalidExtension); + } + tfields.try_insert(tkey, Value::from_short_slice_unchecked(current_tvalue)); + } + + Ok(Self { + lang: 
tlang, + fields: tfields.into(), + }) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + if self.is_empty() { + return Ok(()); + } + f("t")?; + if let Some(lang) = &self.lang { + lang.for_each_subtag_str_lowercased(f)?; + } + self.fields.for_each_subtag_str(f) + } +} + +writeable::impl_display_with_writeable!(Transform); + +impl writeable::Writeable for Transform { + fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result { + if self.is_empty() { + return Ok(()); + } + sink.write_str("t")?; + if let Some(lang) = &self.lang { + sink.write_char('-')?; + lang.write_lowercased_to(sink)?; + } + if !self.fields.is_empty() { + sink.write_char('-')?; + writeable::Writeable::write_to(&self.fields, sink)?; + } + Ok(()) + } + + fn writeable_length_hint(&self) -> writeable::LengthHint { + if self.is_empty() { + return writeable::LengthHint::exact(0); + } + let mut result = writeable::LengthHint::exact(1); + if let Some(lang) = &self.lang { + result += writeable::Writeable::writeable_length_hint(lang) + 1; + } + if !self.fields.is_empty() { + result += writeable::Writeable::writeable_length_hint(&self.fields) + 1; + } + result + } +} diff --git a/third_party/rust/icu_locid/src/extensions/transform/value.rs b/third_party/rust/icu_locid/src/extensions/transform/value.rs new file mode 100644 index 0000000000..798e84793d --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/transform/value.rs @@ -0,0 +1,134 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::helpers::ShortSlice; +use crate::parser::{ParserError, SubtagIterator}; +use core::ops::RangeInclusive; +use core::str::FromStr; +use tinystr::TinyAsciiStr; + +/// A value used in a list of [`Fields`](super::Fields). 
+/// +/// The value has to be a sequence of one or more alphanumerical strings +/// separated by `-`. +/// Each part of the sequence has to be no shorter than three characters and no +/// longer than 8. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::transform::Value; +/// +/// "hybrid".parse::<Value>().expect("Valid Value."); +/// +/// "hybrid-foobar".parse::<Value>().expect("Valid Value."); +/// +/// "no".parse::<Value>().expect_err("Invalid Value."); +/// ``` +#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Default)] +pub struct Value(ShortSlice<TinyAsciiStr<{ *TYPE_LENGTH.end() }>>); + +const TYPE_LENGTH: RangeInclusive<usize> = 3..=8; +const TRUE_TVALUE: TinyAsciiStr<8> = tinystr::tinystr!(8, "true"); + +impl Value { + /// A constructor which takes a utf8 slice, parses it and + /// produces a well-formed [`Value`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::transform::Value; + /// + /// let value = Value::try_from_bytes(b"hybrid").expect("Parsing failed."); + /// ``` + pub fn try_from_bytes(input: &[u8]) -> Result<Self, ParserError> { + let mut v = ShortSlice::default(); + let mut has_value = false; + + for subtag in SubtagIterator::new(input) { + if !Self::is_type_subtag(subtag) { + return Err(ParserError::InvalidExtension); + } + has_value = true; + let val = + TinyAsciiStr::from_bytes(subtag).map_err(|_| ParserError::InvalidExtension)?; + if val != TRUE_TVALUE { + v.push(val); + } + } + + if !has_value { + return Err(ParserError::InvalidExtension); + } + Ok(Self(v)) + } + + pub(crate) fn from_short_slice_unchecked( + input: ShortSlice<TinyAsciiStr<{ *TYPE_LENGTH.end() }>>, + ) -> Self { + Self(input) + } + + pub(crate) fn is_type_subtag(t: &[u8]) -> bool { + TYPE_LENGTH.contains(&t.len()) && t.iter().all(u8::is_ascii_alphanumeric) + } + + pub(crate) fn parse_subtag( + t: &[u8], + ) -> Result<Option<TinyAsciiStr<{ *TYPE_LENGTH.end() }>>, ParserError> { + let s = 
TinyAsciiStr::from_bytes(t).map_err(|_| ParserError::InvalidSubtag)?; + if !TYPE_LENGTH.contains(&t.len()) || !s.is_ascii_alphanumeric() { + return Err(ParserError::InvalidExtension); + } + + let s = s.to_ascii_lowercase(); + + if s == TRUE_TVALUE { + Ok(None) + } else { + Ok(Some(s)) + } + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + if self.0.is_empty() { + f("true")?; + } else { + self.0.iter().map(TinyAsciiStr::as_str).try_for_each(f)?; + } + Ok(()) + } +} + +impl FromStr for Value { + type Err = ParserError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::try_from_bytes(source.as_bytes()) + } +} + +impl_writeable_for_each_subtag_str_no_test!(Value, selff, selff.0.is_empty() => alloc::borrow::Cow::Borrowed("true")); + +#[test] +fn test_writeable() { + use writeable::assert_writeable_eq; + + let hybrid = "hybrid".parse().unwrap(); + let foobar = "foobar".parse().unwrap(); + + assert_writeable_eq!(Value::default(), "true"); + assert_writeable_eq!( + Value::from_short_slice_unchecked(vec![hybrid].into()), + "hybrid" + ); + assert_writeable_eq!( + Value::from_short_slice_unchecked(vec![hybrid, foobar].into()), + "hybrid-foobar" + ); +} diff --git a/third_party/rust/icu_locid/src/extensions/unicode/attribute.rs b/third_party/rust/icu_locid/src/extensions/unicode/attribute.rs new file mode 100644 index 0000000000..f6fc53e057 --- /dev/null +++ b/third_party/rust/icu_locid/src/extensions/unicode/attribute.rs @@ -0,0 +1,34 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +impl_tinystr_subtag!( + /// An attribute used in a set of [`Attributes`](super::Attributes). + /// + /// An attribute has to be a sequence of alphanumerical characters no + /// shorter than three and no longer than eight characters. 
+    ///
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{attribute, Attribute};
+    ///
+    /// let attr: Attribute =
+    ///     "buddhist".parse().expect("Failed to parse an Attribute.");
+    ///
+    /// assert_eq!(attr, attribute!("buddhist"));
+    /// ```
+    // NOTE(review): `impl_tinystr_subtag!` is defined outside this file, so the
+    // role of each positional argument below is inferred from its value and from
+    // the doc comment above — confirm against the macro definition.
+    Attribute,
+    extensions::unicode,
+    attribute,
+    extensions_unicode_attribute,
+    // Accepted subtag length; matches "no shorter than three and no longer than
+    // eight characters" in the doc comment.
+    3..=8,
+    s,
+    // Presumably the well-formedness check applied to the input `s`.
+    s.is_ascii_alphanumeric(),
+    // Presumably the normalization applied to accepted input.
+    s.to_ascii_lowercase(),
+    // Presumably the check that `s` is already in normalized form.
+    s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
+    // Presumably the `ParserError` variant reported for rejected input.
+    InvalidExtension,
+    // Presumably sample inputs that must parse, followed by inputs that must be
+    // rejected ("no" is shorter than 3 bytes, "toolooong" is longer than 8).
+    ["foo12"],
+    ["no", "toolooong"],
+);
diff --git a/third_party/rust/icu_locid/src/extensions/unicode/attributes.rs b/third_party/rust/icu_locid/src/extensions/unicode/attributes.rs
new file mode 100644
index 0000000000..1cdaded306
--- /dev/null
+++ b/third_party/rust/icu_locid/src/extensions/unicode/attributes.rs
@@ -0,0 +1,120 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use super::Attribute;
+
+use crate::helpers::ShortSlice;
+use alloc::vec::Vec;
+use core::ops::Deref;
+
+/// A set of [`Attribute`] elements as defined in [`Unicode Extension Attributes`].
+///
+/// [`Unicode Extension Attributes`]: https://unicode.org/reports/tr35/tr35.html#u_Extension
+///
+/// # Examples
+///
+/// ```
+/// use icu::locid::extensions::unicode::{Attribute, Attributes};
+///
+/// let attribute1: Attribute =
+///     "foobar".parse().expect("Failed to parse an attribute.");
+///
+/// let attribute2: Attribute = "testing"
+///     .parse()
+///     .expect("Failed to parse an attribute.");
+/// let mut v = vec![attribute1, attribute2];
+/// v.sort();
+/// v.dedup();
+///
+/// let attributes: Attributes = Attributes::from_vec_unchecked(v);
+/// assert_eq!(attributes.to_string(), "foobar-testing");
+/// ```
+#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
+pub struct Attributes(ShortSlice<Attribute>);
+
+impl Attributes {
+    /// Returns a new empty set of attributes. Same as [`default()`](Default::default()), but is `const`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Attributes;
+    ///
+    /// assert_eq!(Attributes::new(), Attributes::default());
+    /// ```
+    #[inline]
+    pub const fn new() -> Self {
+        Self(ShortSlice::new())
+    }
+
+    /// A constructor which takes a pre-sorted list of [`Attribute`] elements.
+    ///
+    /// The list is stored exactly as given: this constructor neither sorts nor
+    /// de-duplicates it, so the caller is responsible for both.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{Attribute, Attributes};
+    ///
+    /// let attribute1: Attribute = "foobar".parse().expect("Parsing failed.");
+    /// let attribute2: Attribute = "testing".parse().expect("Parsing failed.");
+    /// let mut v = vec![attribute1, attribute2];
+    /// v.sort();
+    /// v.dedup();
+    ///
+    /// let attributes = Attributes::from_vec_unchecked(v);
+    /// ```
+    ///
+    /// Notice: For performance- and memory-constrained environments, it is recommended
+    /// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort)
+    /// and [`dedup`](Vec::dedup()).
+    pub fn from_vec_unchecked(input: Vec<Attribute>) -> Self {
+        Self(input.into())
+    }
+
+    /// Wraps an already-built [`ShortSlice`] of attributes without checking its
+    /// order or uniqueness.
+    pub(crate) fn from_short_slice_unchecked(input: ShortSlice<Attribute>) -> Self {
+        Self(input)
+    }
+
+    /// Empties the [`Attributes`] list.
+    ///
+    /// Returns the old list.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{attribute, Attribute, Attributes};
+    /// use writeable::assert_writeable_eq;
+    ///
+    /// let mut attributes = Attributes::from_vec_unchecked(vec![
+    ///     attribute!("foobar"),
+    ///     attribute!("testing"),
+    /// ]);
+    ///
+    /// assert_writeable_eq!(attributes, "foobar-testing");
+    ///
+    /// attributes.clear();
+    ///
+    /// assert_writeable_eq!(attributes, "");
+    /// ```
+    pub fn clear(&mut self) -> Self {
+        // `take` leaves `Self::default()` (the empty list) behind and hands the
+        // previous contents back to the caller.
+        core::mem::take(self)
+    }
+
+    /// Calls `f` with the string form of each attribute in stored order,
+    /// stopping at the first error returned by `f`.
+    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
+    where
+        F: FnMut(&str) -> Result<(), E>,
+    {
+        self.deref().iter().map(|t| t.as_str()).try_for_each(f)
+    }
+}
+
+impl_writeable_for_subtag_list!(Attributes, "foobar", "testing");
+
+/// Exposes the attributes as a plain slice, giving read-only slice methods
+/// on [`Attributes`].
+impl Deref for Attributes {
+    type Target = [Attribute];
+
+    fn deref(&self) -> &[Attribute] {
+        self.0.deref()
+    }
+}
diff --git a/third_party/rust/icu_locid/src/extensions/unicode/key.rs b/third_party/rust/icu_locid/src/extensions/unicode/key.rs
new file mode 100644
index 0000000000..e008ffd5a8
--- /dev/null
+++ b/third_party/rust/icu_locid/src/extensions/unicode/key.rs
@@ -0,0 +1,32 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+impl_tinystr_subtag!(
+    /// A key used in a list of [`Keywords`](super::Keywords).
+    ///
+    /// The key has to be two ASCII characters long, with the first
+    /// character being alphanumeric, and the second being alphabetic.
+    ///
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Key;
+    ///
+    /// assert!("ca".parse::<Key>().is_ok());
+    /// ```
+    // NOTE(review): `impl_tinystr_subtag!` is defined outside this file, so the
+    // role of each positional argument below is inferred from its value —
+    // confirm against the macro definition.
+    Key,
+    extensions::unicode,
+    key,
+    extensions_unicode_key,
+    // A key is exactly two bytes long.
+    2..=2,
+    s,
+    // Presumably the well-formedness check: first byte alphanumeric, second
+    // byte alphabetic.
+    s.all_bytes()[0].is_ascii_alphanumeric() && s.all_bytes()[1].is_ascii_alphabetic(),
+    // Presumably the normalization applied to accepted input.
+    s.to_ascii_lowercase(),
+    // Presumably the check that `s` is already normalized: a lowercase letter
+    // or digit followed by a lowercase letter.
+    (s.all_bytes()[0].is_ascii_lowercase() || s.all_bytes()[0].is_ascii_digit())
+        && s.all_bytes()[1].is_ascii_lowercase(),
+    // Presumably the `ParserError` variant reported for rejected input.
+    InvalidExtension,
+    // Presumably sample inputs that must parse, followed by inputs that must be
+    // rejected ("a" and "abc" have the wrong length, "a8" ends in a digit).
+    ["ca", "8a"],
+    ["a", "a8", "abc"],
+);
diff --git a/third_party/rust/icu_locid/src/extensions/unicode/keywords.rs b/third_party/rust/icu_locid/src/extensions/unicode/keywords.rs
new file mode 100644
index 0000000000..c2839fa44f
--- /dev/null
+++ b/third_party/rust/icu_locid/src/extensions/unicode/keywords.rs
@@ -0,0 +1,393 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use core::borrow::Borrow;
+use core::cmp::Ordering;
+use core::iter::FromIterator;
+use litemap::LiteMap;
+
+use super::Key;
+use super::Value;
+use crate::helpers::ShortSlice;
+use crate::ordering::SubtagOrderingResult;
+
+/// A list of [`Key`]-[`Value`] pairs representing functional information
+/// about a locale's internationalization preferences.
+///
+/// Here are examples of fields used in Unicode:
+/// - `hc` - Hour Cycle (`h11`, `h12`, `h23`, `h24`)
+/// - `ca` - Calendar (`buddhist`, `gregory`, ...)
+/// - `fw` - First Day Of the Week (`sun`, `mon`, `sat`, ...)
+///
+/// You can find the full list in the [`Unicode BCP 47 U Extension`] section of LDML.
+///
+/// [`Unicode BCP 47 U Extension`]: https://unicode.org/reports/tr35/tr35.html#Key_And_Type_Definitions_
+///
+/// # Examples
+///
+/// Manually build up a [`Keywords`] object:
+///
+/// ```
+/// use icu::locid::{
+///     extensions::unicode::{key, value, Keywords},
+///     locale,
+/// };
+///
+/// let keywords = [(key!("hc"), value!("h23"))]
+///     .into_iter()
+///     .collect::<Keywords>();
+///
+/// assert_eq!(&keywords.to_string(), "hc-h23");
+/// ```
+///
+/// Access a [`Keywords`] object from a [`Locale`]:
+///
+/// ```
+/// use icu::locid::{
+///     extensions::unicode::{key, value},
+///     Locale,
+/// };
+///
+/// let loc: Locale = "und-u-hc-h23-kc-true".parse().expect("Valid BCP-47");
+///
+/// assert_eq!(loc.extensions.unicode.keywords.get(&key!("ca")), None);
+/// assert_eq!(
+///     loc.extensions.unicode.keywords.get(&key!("hc")),
+///     Some(&value!("h23"))
+/// );
+/// assert_eq!(
+///     loc.extensions.unicode.keywords.get(&key!("kc")),
+///     Some(&value!("true"))
+/// );
+///
+/// assert_eq!(loc.extensions.unicode.keywords.to_string(), "hc-h23-kc");
+/// ```
+///
+/// [`Locale`]: crate::Locale
+#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
+pub struct Keywords(LiteMap<Key, Value, ShortSlice<(Key, Value)>>);
+
+impl Keywords {
+    /// Returns a new empty list of key-value pairs. Same as [`default()`](Default::default()), but is `const`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Keywords;
+    ///
+    /// assert_eq!(Keywords::new(), Keywords::default());
+    /// ```
+    #[inline]
+    pub const fn new() -> Self {
+        Self(LiteMap::new())
+    }
+
+    /// Create a new list of key-value pairs having exactly one pair, callable in a `const` context.
+    #[inline]
+    pub const fn new_single(key: Key, value: Value) -> Self {
+        // A store holding a single pair is trivially sorted, so the unchecked
+        // constructor is fine here.
+        Self(LiteMap::from_sorted_store_unchecked(
+            ShortSlice::new_single((key, value)),
+        ))
+    }
+
+    /// Returns `true` if there are no keywords.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Keywords;
+    /// use icu::locid::locale;
+    /// use icu::locid::Locale;
+    ///
+    /// let loc1 = Locale::try_from_bytes(b"und-t-h0-hybrid").unwrap();
+    /// let loc2 = locale!("und-u-ca-buddhist");
+    ///
+    /// assert!(loc1.extensions.unicode.keywords.is_empty());
+    /// assert!(!loc2.extensions.unicode.keywords.is_empty());
+    /// ```
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    /// Returns `true` if the list contains a [`Value`] for the specified [`Key`].
+    ///
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{key, value, Keywords};
+    ///
+    /// let keywords = [(key!("ca"), value!("gregory"))]
+    ///     .into_iter()
+    ///     .collect::<Keywords>();
+    ///
+    /// assert!(&keywords.contains_key(&key!("ca")));
+    /// ```
+    pub fn contains_key<Q>(&self, key: &Q) -> bool
+    where
+        Key: Borrow<Q>,
+        Q: Ord,
+    {
+        self.0.contains_key(key)
+    }
+
+    /// Returns a reference to the [`Value`] corresponding to the [`Key`].
+    ///
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{key, value, Keywords};
+    ///
+    /// let keywords = [(key!("ca"), value!("buddhist"))]
+    ///     .into_iter()
+    ///     .collect::<Keywords>();
+    ///
+    /// assert_eq!(keywords.get(&key!("ca")), Some(&value!("buddhist")));
+    /// ```
+    pub fn get<Q>(&self, key: &Q) -> Option<&Value>
+    where
+        Key: Borrow<Q>,
+        Q: Ord,
+    {
+        self.0.get(key)
+    }
+
+    /// Returns a mutable reference to the [`Value`] corresponding to the [`Key`].
+    ///
+    /// Returns `None` if the key doesn't exist.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{key, value, Keywords};
+    ///
+    /// let mut keywords = [(key!("ca"), value!("buddhist"))]
+    ///     .into_iter()
+    ///     .collect::<Keywords>();
+    ///
+    /// if let Some(value) = keywords.get_mut(&key!("ca")) {
+    ///     *value = value!("gregory");
+    /// }
+    /// assert_eq!(keywords.get(&key!("ca")), Some(&value!("gregory")));
+    /// ```
+    pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut Value>
+    where
+        Key: Borrow<Q>,
+        Q: Ord,
+    {
+        self.0.get_mut(key)
+    }
+
+    /// Sets the specified keyword, returning the old value if it already existed.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Key;
+    /// use icu::locid::extensions::unicode::Value;
+    /// use icu::locid::extensions::unicode::{key, value};
+    /// use icu::locid::Locale;
+    ///
+    /// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12"
+    ///     .parse()
+    ///     .expect("valid BCP-47 identifier");
+    /// let old_value = loc
+    ///     .extensions
+    ///     .unicode
+    ///     .keywords
+    ///     .set(key!("ca"), value!("japanese"));
+    ///
+    /// assert_eq!(old_value, Some(value!("buddhist")));
+    /// assert_eq!(loc, "und-u-hello-ca-japanese-hc-h12".parse().unwrap());
+    /// ```
+    pub fn set(&mut self, key: Key, value: Value) -> Option<Value> {
+        self.0.insert(key, value)
+    }
+
+    /// Removes the specified keyword, returning the old value if it existed.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::{key, Key};
+    /// use icu::locid::Locale;
+    ///
+    /// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12"
+    ///     .parse()
+    ///     .expect("valid BCP-47 identifier");
+    /// loc.extensions.unicode.keywords.remove(key!("ca"));
+    /// assert_eq!(loc, "und-u-hello-hc-h12".parse().unwrap());
+    /// ```
+    pub fn remove<Q: Borrow<Key>>(&mut self, key: Q) -> Option<Value> {
+        self.0.remove(key.borrow())
+    }
+
+    /// Clears all Unicode extension keywords, leaving Unicode attributes.
+    ///
+    /// Returns the old Unicode extension keywords.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::Locale;
+    ///
+    /// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12".parse().unwrap();
+    /// loc.extensions.unicode.keywords.clear();
+    /// assert_eq!(loc, "und-u-hello".parse().unwrap());
+    /// ```
+    pub fn clear(&mut self) -> Self {
+        // `take` leaves `Self::default()` (no keywords) behind and returns the
+        // previous contents.
+        core::mem::take(self)
+    }
+
+    /// Retains a subset of keywords as specified by the predicate function.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::key;
+    /// use icu::locid::Locale;
+    ///
+    /// let mut loc: Locale = "und-u-ca-buddhist-hc-h12-ms-metric".parse().unwrap();
+    ///
+    /// loc.extensions
+    ///     .unicode
+    ///     .keywords
+    ///     .retain_by_key(|&k| k == key!("hc"));
+    /// assert_eq!(loc, "und-u-hc-h12".parse().unwrap());
+    ///
+    /// loc.extensions
+    ///     .unicode
+    ///     .keywords
+    ///     .retain_by_key(|&k| k == key!("ms"));
+    /// assert_eq!(loc, Locale::UND);
+    /// ```
+    pub fn retain_by_key<F>(&mut self, mut predicate: F)
+    where
+        F: FnMut(&Key) -> bool,
+    {
+        self.0.retain(|k, _| predicate(k))
+    }
+
+    /// Compare this [`Keywords`] with BCP-47 bytes.
+    ///
+    /// The return value is equivalent to what would happen if you first converted this
+    /// [`Keywords`] to a BCP-47 string and then performed a byte comparison.
+    ///
+    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
+    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Keywords;
+    /// use icu::locid::Locale;
+    /// use std::cmp::Ordering;
+    ///
+    /// let bcp47_strings: &[&str] =
+    ///     &["ca-hebrew", "ca-japanese", "ca-japanese-nu-latn", "nu-latn"];
+    ///
+    /// for ab in bcp47_strings.windows(2) {
+    ///     let a = ab[0];
+    ///     let b = ab[1];
+    ///     assert!(a.cmp(b) == Ordering::Less);
+    ///     let a_kwds = format!("und-u-{}", a)
+    ///         .parse::<Locale>()
+    ///         .unwrap()
+    ///         .extensions
+    ///         .unicode
+    ///         .keywords;
+    ///     assert!(a_kwds.strict_cmp(a.as_bytes()) == Ordering::Equal);
+    ///     assert!(a_kwds.strict_cmp(b.as_bytes()) == Ordering::Less);
+    /// }
+    /// ```
+    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
+        // Split `other` on `-` and compare it subtag by subtag.
+        self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
+    }
+
+    /// Compare this [`Keywords`] with an iterator of BCP-47 subtags.
+    ///
+    /// This function has the same equality semantics as [`Keywords::strict_cmp`]. It is intended as
+    /// a more modular version that allows multiple subtag iterators to be chained together.
+    ///
+    /// For an additional example, see [`SubtagOrderingResult`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Keywords;
+    /// use icu::locid::locale;
+    /// use std::cmp::Ordering;
+    ///
+    /// let subtags: &[&[u8]] = &[b"ca", b"buddhist"];
+    ///
+    /// let kwds = locale!("und-u-ca-buddhist").extensions.unicode.keywords;
+    /// assert_eq!(
+    ///     Ordering::Equal,
+    ///     kwds.strict_cmp_iter(subtags.iter().copied()).end()
+    /// );
+    ///
+    /// let kwds = locale!("und").extensions.unicode.keywords;
+    /// assert_eq!(
+    ///     Ordering::Less,
+    ///     kwds.strict_cmp_iter(subtags.iter().copied()).end()
+    /// );
+    ///
+    /// let kwds = locale!("und-u-nu-latn").extensions.unicode.keywords;
+    /// assert_eq!(
+    ///     Ordering::Greater,
+    ///     kwds.strict_cmp_iter(subtags.iter().copied()).end()
+    /// );
+    /// ```
+    pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
+    where
+        I: Iterator<Item = &'l [u8]>,
+    {
+        // Walk our own subtags and pull one subtag from `subtags` for each. The
+        // closure's `Err` carries the ordering as soon as a difference is found.
+        let r = self.for_each_subtag_str(&mut |subtag| {
+            if let Some(other) = subtags.next() {
+                match subtag.as_bytes().cmp(other) {
+                    Ordering::Equal => Ok(()),
+                    not_equal => Err(not_equal),
+                }
+            } else {
+                // `subtags` ran out first, so `self` is the longer sequence.
+                Err(Ordering::Greater)
+            }
+        });
+        match r {
+            // All of our subtags matched; hand back the rest of the iterator so
+            // the caller can keep comparing.
+            Ok(_) => SubtagOrderingResult::Subtags(subtags),
+            Err(o) => SubtagOrderingResult::Ordering(o),
+        }
+    }
+
+    /// Calls `f` with each key followed by the subtags of its value, in map
+    /// iteration order, stopping at the first error returned by `f`.
+    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
+    where
+        F: FnMut(&str) -> Result<(), E>,
+    {
+        for (k, v) in self.0.iter() {
+            f(k.as_str())?;
+            v.for_each_subtag_str(f)?;
+        }
+        Ok(())
+    }
+
+    /// This needs to be its own method to help with type inference in helpers.rs
+    #[cfg(test)]
+    pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self {
+        v.into_iter().collect()
+    }
+}
+
+/// Wraps an existing map without copying or re-validating it.
+impl From<LiteMap<Key, Value, ShortSlice<(Key, Value)>>> for Keywords {
+    fn from(map: LiteMap<Key, Value, ShortSlice<(Key, Value)>>) -> Self {
+        Self(map)
+    }
+}
+
+/// Builds [`Keywords`] from key-value pairs by collecting them into a
+/// [`LiteMap`] first.
+impl FromIterator<(Key, Value)> for Keywords {
+    fn from_iter<I: IntoIterator<Item = (Key, Value)>>(iter: I) -> Self {
+        LiteMap::from_iter(iter).into()
+    }
+}
+
+impl_writeable_for_key_value!(Keywords, "ca", "islamic-civil", "mm", "mm");
diff --git a/third_party/rust/icu_locid/src/extensions/unicode/mod.rs b/third_party/rust/icu_locid/src/extensions/unicode/mod.rs
new file mode 100644
index 0000000000..95f1a2d781
--- /dev/null
+++ b/third_party/rust/icu_locid/src/extensions/unicode/mod.rs
@@ -0,0 +1,237 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! Unicode Extensions provide information about user preferences in a given locale.
+//!
+//! The main struct for this extension is [`Unicode`] which contains [`Keywords`] and
+//! [`Attributes`].
+//!
+//!
+//! # Examples
+//!
+//! ```
+//! use icu::locid::extensions::unicode::{attribute, key, value, Unicode};
+//! use icu::locid::Locale;
+//!
+//! let loc: Locale = "en-US-u-foobar-hc-h12".parse().expect("Parsing failed.");
+//!
+//! assert_eq!(
+//!     loc.extensions.unicode.keywords.get(&key!("hc")),
+//!     Some(&value!("h12"))
+//! );
+//! assert!(loc
+//!     .extensions
+//!     .unicode
+//!     .attributes
+//!     .contains(&attribute!("foobar")));
+//! ```
+mod attribute;
+mod attributes;
+mod key;
+mod keywords;
+mod value;
+
+#[doc(inline)]
+pub use attribute::{attribute, Attribute};
+pub use attributes::Attributes;
+#[doc(inline)]
+pub use key::{key, Key};
+pub use keywords::Keywords;
+#[doc(inline)]
+pub use value::{value, Value};
+
+use crate::helpers::ShortSlice;
+use crate::parser::ParserError;
+use crate::parser::SubtagIterator;
+use litemap::LiteMap;
+
+/// Unicode Extensions provide information about user preferences in a given locale.
+///
+/// A list of [`Unicode BCP47 U Extensions`] as defined in [`Unicode Locale
+/// Identifier`] specification.
+///
+/// Unicode extensions provide subtags that specify language and/or locale-based behavior
+/// or refinements to language tags, according to work done by the Unicode Consortium.
+/// (See [`RFC 6067`] for details).
+///
+/// [`Unicode BCP47 U Extensions`]: https://unicode.org/reports/tr35/#u_Extension
+/// [`RFC 6067`]: https://www.ietf.org/rfc/rfc6067.txt
+/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
+///
+/// # Examples
+///
+/// ```
+/// use icu::locid::extensions::unicode::{key, value};
+/// use icu::locid::Locale;
+///
+/// let loc: Locale =
+///     "de-u-hc-h12-ca-buddhist".parse().expect("Parsing failed.");
+///
+/// assert_eq!(
+///     loc.extensions.unicode.keywords.get(&key!("ca")),
+///     Some(&value!("buddhist"))
+/// );
+/// ```
+#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
+#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
+pub struct Unicode {
+    /// The key-value pairs present in this locale extension, with each extension key subtag
+    /// associated to its provided value subtag.
+    pub keywords: Keywords,
+    /// A canonically ordered sequence of single standalone subtags for this locale extension.
+    pub attributes: Attributes,
+}
+
+impl Unicode {
+    /// Returns a new empty map of Unicode extensions. Same as [`default()`](Default::default()), but is `const`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::extensions::unicode::Unicode;
+    ///
+    /// assert_eq!(Unicode::new(), Unicode::default());
+    /// ```
+    #[inline]
+    pub const fn new() -> Self {
+        Self {
+            keywords: Keywords::new(),
+            attributes: Attributes::new(),
+        }
+    }
+
+    /// Returns [`true`] if both the list of keywords and the list of attributes are empty.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::Locale;
+    ///
+    /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
+    ///
+    /// assert!(!loc.extensions.unicode.is_empty());
+    /// ```
+    pub fn is_empty(&self) -> bool {
+        self.keywords.is_empty() && self.attributes.is_empty()
+    }
+
+    /// Clears all Unicode extension keywords and attributes, effectively removing
+    /// the Unicode extension.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::Locale;
+    ///
+    /// let mut loc: Locale =
+    ///     "und-t-mul-u-hello-ca-buddhist-hc-h12".parse().unwrap();
+    /// loc.extensions.unicode.clear();
+    /// assert_eq!(loc, "und-t-mul".parse().unwrap());
+    /// ```
+    pub fn clear(&mut self) {
+        // Both `clear` calls return the previous contents, which are dropped here.
+        self.keywords.clear();
+        self.attributes.clear();
+    }
+
+    /// Parses attributes and keywords from the subtags remaining in `iter`.
+    ///
+    /// Parsing stops, without consuming it, at the first subtag that fits
+    /// neither an attribute, a key, nor a value of the current key.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a two-byte subtag in key position is not a valid
+    /// [`Key`], and [`ParserError::InvalidExtension`] if neither an attribute nor
+    /// a keyword was found.
+    pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParserError> {
+        let mut attributes = ShortSlice::new();
+
+        // Phase 1: leading subtags that parse as attributes. They are kept sorted
+        // and unique by inserting at the position reported by the binary search.
+        while let Some(subtag) = iter.peek() {
+            if let Ok(attr) = Attribute::try_from_bytes(subtag) {
+                if let Err(idx) = attributes.binary_search(&attr) {
+                    attributes.insert(idx, attr);
+                }
+            } else {
+                break;
+            }
+            iter.next();
+        }
+
+        let mut keywords = LiteMap::new();
+
+        let mut current_keyword = None;
+        let mut current_value = ShortSlice::new();
+
+        // Phase 2: keywords. A two-byte subtag starts a new key; anything else is
+        // a value subtag of the key currently being collected.
+        while let Some(subtag) = iter.peek() {
+            let slen = subtag.len();
+            if slen == 2 {
+                // Flush the previous key together with the value subtags
+                // collected for it.
+                // NOTE(review): `try_insert` presumably leaves an existing entry
+                // untouched when a key is repeated — confirm against `LiteMap`.
+                if let Some(kw) = current_keyword.take() {
+                    keywords.try_insert(kw, Value::from_short_slice_unchecked(current_value));
+                    current_value = ShortSlice::new();
+                }
+                current_keyword = Some(Key::try_from_bytes(subtag)?);
+            } else if current_keyword.is_some() {
+                match Value::parse_subtag(subtag) {
+                    Ok(Some(t)) => current_value.push(t),
+                    // The subtag was accepted but is not stored (the "true" case).
+                    Ok(None) => {}
+                    Err(_) => break,
+                }
+            } else {
+                break;
+            }
+            iter.next();
+        }
+
+        // Flush the last key, if any.
+        if let Some(kw) = current_keyword.take() {
+            keywords.try_insert(kw, Value::from_short_slice_unchecked(current_value));
+        }
+
+        // Ensure we've defined at least one attribute or keyword
+        if attributes.is_empty() && keywords.is_empty() {
+            return Err(ParserError::InvalidExtension);
+        }
+
+        Ok(Self {
+            keywords: keywords.into(),
+            attributes: Attributes::from_short_slice_unchecked(attributes),
+        })
+    }
+
+    /// Calls `f` with `"u"`, then every attribute, then every keyword subtag,
+    /// stopping at the first error. Does nothing if the extension is empty.
+    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
+    where
+        F: FnMut(&str) -> Result<(), E>,
+    {
+        if self.is_empty() {
+            return Ok(());
+        }
+        f("u")?;
+        self.attributes.for_each_subtag_str(f)?;
+        self.keywords.for_each_subtag_str(f)?;
+        Ok(())
+    }
+}
+
+writeable::impl_display_with_writeable!(Unicode);
+
+impl writeable::Writeable for Unicode {
+    /// Writes `u`, then `-` plus the attributes and `-` plus the keywords, each
+    /// only if non-empty. Writes nothing for an empty extension.
+    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
+        if self.is_empty() {
+            return Ok(());
+        }
+        sink.write_str("u")?;
+        if !self.attributes.is_empty() {
+            sink.write_char('-')?;
+            writeable::Writeable::write_to(&self.attributes, sink)?;
+        }
+        if !self.keywords.is_empty() {
+            sink.write_char('-')?;
+            writeable::Writeable::write_to(&self.keywords, sink)?;
+        }
+        Ok(())
+    }
+
+    /// Mirrors `write_to`: one byte for `u`, plus each non-empty part's own hint
+    /// and one byte for its `-` separator.
+    fn writeable_length_hint(&self) -> writeable::LengthHint {
+        if self.is_empty() {
+            return writeable::LengthHint::exact(0);
+        }
+        let mut result = writeable::LengthHint::exact(1);
+        if !self.attributes.is_empty() {
+            result += writeable::Writeable::writeable_length_hint(&self.attributes) + 1;
+        }
+        if !self.keywords.is_empty() {
+            result += writeable::Writeable::writeable_length_hint(&self.keywords) + 1;
+        }
+        result
+    }
+}
diff --git a/third_party/rust/icu_locid/src/extensions/unicode/value.rs b/third_party/rust/icu_locid/src/extensions/unicode/value.rs
new file mode 100644
index 0000000000..d935656a97
--- /dev/null
+++ b/third_party/rust/icu_locid/src/extensions/unicode/value.rs
@@ -0,0 +1,196 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+ +use crate::helpers::ShortSlice; +use crate::parser::{ParserError, SubtagIterator}; +use core::ops::RangeInclusive; +use core::str::FromStr; +use tinystr::TinyAsciiStr; + +/// A value used in a list of [`Keywords`](super::Keywords). +/// +/// The value has to be a sequence of one or more alphanumerical strings +/// separated by `-`. +/// Each part of the sequence has to be no shorter than three characters and no +/// longer than 8. +/// +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::unicode::{value, Value}; +/// use writeable::assert_writeable_eq; +/// +/// assert_writeable_eq!(value!("gregory"), "gregory"); +/// assert_writeable_eq!( +/// "islamic-civil".parse::<Value>().unwrap(), +/// "islamic-civil" +/// ); +/// +/// // The value "true" has the special, empty string representation +/// assert_eq!(value!("true").to_string(), ""); +/// ``` +#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Default)] +pub struct Value(ShortSlice<TinyAsciiStr<{ *VALUE_LENGTH.end() }>>); + +const VALUE_LENGTH: RangeInclusive<usize> = 3..=8; +const TRUE_VALUE: TinyAsciiStr<8> = tinystr::tinystr!(8, "true"); + +impl Value { + /// A constructor which takes a utf8 slice, parses it and + /// produces a well-formed [`Value`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::unicode::Value; + /// + /// Value::try_from_bytes(b"buddhist").expect("Parsing failed."); + /// ``` + pub fn try_from_bytes(input: &[u8]) -> Result<Self, ParserError> { + let mut v = ShortSlice::new(); + + if !input.is_empty() { + for subtag in SubtagIterator::new(input) { + let val = Self::subtag_from_bytes(subtag)?; + if let Some(val) = val { + v.push(val); + } + } + } + Ok(Self(v)) + } + + /// Const constructor for when the value contains only a single subtag. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::unicode::Value; + /// + /// Value::try_from_single_subtag(b"buddhist").expect("valid subtag"); + /// Value::try_from_single_subtag(b"#####").expect_err("invalid subtag"); + /// Value::try_from_single_subtag(b"foo-bar").expect_err("not a single subtag"); + /// ``` + pub const fn try_from_single_subtag(subtag: &[u8]) -> Result<Self, ParserError> { + match Self::subtag_from_bytes(subtag) { + Err(_) => Err(ParserError::InvalidExtension), + Ok(option) => Ok(Self::from_tinystr(option)), + } + } + + #[doc(hidden)] + pub fn as_tinystr_slice(&self) -> &[TinyAsciiStr<8>] { + &self.0 + } + + #[doc(hidden)] + pub const fn as_single_subtag(&self) -> Option<&TinyAsciiStr<8>> { + self.0.single() + } + + #[doc(hidden)] + pub const fn from_tinystr(subtag: Option<TinyAsciiStr<8>>) -> Self { + match subtag { + None => Self(ShortSlice::new()), + Some(val) => { + debug_assert!(val.is_ascii_alphanumeric()); + debug_assert!(!matches!(val, TRUE_VALUE)); + Self(ShortSlice::new_single(val)) + } + } + } + + pub(crate) fn from_short_slice_unchecked(input: ShortSlice<TinyAsciiStr<8>>) -> Self { + Self(input) + } + + #[doc(hidden)] + pub const fn subtag_from_bytes(bytes: &[u8]) -> Result<Option<TinyAsciiStr<8>>, ParserError> { + Self::parse_subtag_from_bytes_manual_slice(bytes, 0, bytes.len()) + } + + pub(crate) fn parse_subtag(t: &[u8]) -> Result<Option<TinyAsciiStr<8>>, ParserError> { + Self::parse_subtag_from_bytes_manual_slice(t, 0, t.len()) + } + + pub(crate) const fn parse_subtag_from_bytes_manual_slice( + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<TinyAsciiStr<8>>, ParserError> { + let slice_len = end - start; + if slice_len > *VALUE_LENGTH.end() || slice_len < *VALUE_LENGTH.start() { + return Err(ParserError::InvalidExtension); + } + + match TinyAsciiStr::from_bytes_manual_slice(bytes, start, end) { + Ok(TRUE_VALUE) => Ok(None), + Ok(s) if s.is_ascii_alphanumeric() => 
Ok(Some(s.to_ascii_lowercase())), + Ok(_) => Err(ParserError::InvalidExtension), + Err(_) => Err(ParserError::InvalidSubtag), + } + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + self.0.iter().map(TinyAsciiStr::as_str).try_for_each(f) + } +} + +impl FromStr for Value { + type Err = ParserError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::try_from_bytes(source.as_bytes()) + } +} + +impl_writeable_for_subtag_list!(Value, "islamic", "civil"); + +/// A macro allowing for compile-time construction of valid Unicode [`Value`] subtag. +/// +/// The macro only supports single-subtag values. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::unicode::{key, value}; +/// use icu::locid::Locale; +/// +/// let loc: Locale = "de-u-ca-buddhist".parse().unwrap(); +/// +/// assert_eq!( +/// loc.extensions.unicode.keywords.get(&key!("ca")), +/// Some(&value!("buddhist")) +/// ); +/// ``` +/// +/// [`Value`]: crate::extensions::unicode::Value +#[macro_export] +#[doc(hidden)] +macro_rules! 
extensions_unicode_value { + ($value:literal) => {{ + // What we want: + // const R: $crate::extensions::unicode::Value = + // match $crate::extensions::unicode::Value::try_from_single_subtag($value.as_bytes()) { + // Ok(r) => r, + // #[allow(clippy::panic)] // const context + // _ => panic!(concat!("Invalid Unicode extension value: ", $value)), + // }; + // Workaround until https://github.com/rust-lang/rust/issues/73255 lands: + const R: $crate::extensions::unicode::Value = + $crate::extensions::unicode::Value::from_tinystr( + match $crate::extensions::unicode::Value::subtag_from_bytes($value.as_bytes()) { + Ok(r) => r, + _ => panic!(concat!("Invalid Unicode extension value: ", $value)), + }, + ); + R + }}; +} +#[doc(inline)] +pub use extensions_unicode_value as value; diff --git a/third_party/rust/icu_locid/src/helpers.rs b/third_party/rust/icu_locid/src/helpers.rs new file mode 100644 index 0000000000..d12435fbf3 --- /dev/null +++ b/third_party/rust/icu_locid/src/helpers.rs @@ -0,0 +1,698 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::iter::FromIterator; + +use alloc::boxed::Box; +use alloc::vec; +use alloc::vec::Vec; +use core::ops::{Deref, DerefMut}; +use litemap::store::*; + +/// Internal: A vector that supports no-allocation, constant values if length 0 or 1. +/// Using ZeroOne(Option<T>) saves 8 bytes in ShortSlice via niche optimization. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub(crate) enum ShortSlice<T> {
+    // Zero (`None`) or one (`Some`) element, stored inline.
+    ZeroOne(Option<T>),
+    // Boxed storage; every constructor in this file only creates it for two or
+    // more elements.
+    Multi(Box<[T]>),
+}
+
+impl<T> ShortSlice<T> {
+    /// Creates an empty `ShortSlice`.
+    #[inline]
+    pub const fn new() -> Self {
+        Self::ZeroOne(None)
+    }
+
+    /// Creates a `ShortSlice` holding exactly `item`.
+    #[inline]
+    pub const fn new_single(item: T) -> Self {
+        Self::ZeroOne(Some(item))
+    }
+
+    /// Appends `item` at the end.
+    pub fn push(&mut self, item: T) {
+        // Move the current contents out (leaving an empty placeholder) so the
+        // variant can be rebuilt by value.
+        *self = match core::mem::replace(self, Self::ZeroOne(None)) {
+            ShortSlice::ZeroOne(None) => ShortSlice::ZeroOne(Some(item)),
+            ShortSlice::ZeroOne(Some(prev_item)) => {
+                ShortSlice::Multi(vec![prev_item, item].into_boxed_slice())
+            }
+            ShortSlice::Multi(items) => {
+                let mut items = items.into_vec();
+                items.push(item);
+                ShortSlice::Multi(items.into_boxed_slice())
+            }
+        };
+    }
+
+    /// Returns the element if there is exactly one stored inline, else `None`.
+    #[inline]
+    pub const fn single(&self) -> Option<&T> {
+        match self {
+            ShortSlice::ZeroOne(Some(v)) => Some(v),
+            _ => None,
+        }
+    }
+
+    /// Returns the number of elements.
+    #[inline]
+    pub fn len(&self) -> usize {
+        match self {
+            ShortSlice::ZeroOne(None) => 0,
+            ShortSlice::ZeroOne(_) => 1,
+            ShortSlice::Multi(ref v) => v.len(),
+        }
+    }
+
+    /// Inserts `elt` at position `index`, shifting later elements to the right.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index > len`.
+    pub fn insert(&mut self, index: usize, elt: T) {
+        assert!(
+            index <= self.len(),
+            "insertion index (is {}) should be <= len (is {})",
+            index,
+            self.len()
+        );
+
+        *self = match core::mem::replace(self, ShortSlice::ZeroOne(None)) {
+            ShortSlice::ZeroOne(None) => ShortSlice::ZeroOne(Some(elt)),
+            ShortSlice::ZeroOne(Some(item)) => {
+                // With one existing element, `index` is 0 or 1 (asserted above).
+                let items = if index == 0 {
+                    vec![elt, item].into_boxed_slice()
+                } else {
+                    vec![item, elt].into_boxed_slice()
+                };
+                ShortSlice::Multi(items)
+            }
+            ShortSlice::Multi(items) => {
+                let mut items = items.into_vec();
+                items.insert(index, elt);
+                ShortSlice::Multi(items.into_boxed_slice())
+            }
+        }
+    }
+
+    /// Removes and returns the element at `index`, shifting later elements to
+    /// the left.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index >= len`.
+    pub fn remove(&mut self, index: usize) -> T {
+        assert!(
+            index < self.len(),
+            "removal index (is {}) should be < len (is {})",
+            index,
+            self.len()
+        );
+
+        let (replaced, removed_item) = match core::mem::replace(self, ShortSlice::ZeroOne(None)) {
+            // The assertion above guarantees at least one element.
+            ShortSlice::ZeroOne(None) => unreachable!(),
+            ShortSlice::ZeroOne(Some(v)) => (ShortSlice::ZeroOne(None), v),
+            ShortSlice::Multi(v) => {
+                let mut v = v.into_vec();
+                let removed_item = v.remove(index);
+                match v.len() {
+                    #[allow(clippy::unwrap_used)]
+                    // we know that the vec has exactly one element left
+                    1 => (ShortSlice::ZeroOne(Some(v.pop().unwrap())), removed_item),
+                    // v has at least 2 elements, create a Multi variant
+                    _ => (ShortSlice::Multi(v.into_boxed_slice()), removed_item),
+                }
+            }
+        };
+        *self = replaced;
+        removed_item
+    }
+
+    /// Removes all elements.
+    #[inline]
+    pub fn clear(&mut self) {
+        let _ = core::mem::replace(self, ShortSlice::ZeroOne(None));
+    }
+
+    /// Keeps only the elements for which `f` returns `true`.
+    pub fn retain<F>(&mut self, mut f: F)
+    where
+        F: FnMut(&T) -> bool,
+    {
+        *self = match core::mem::take(self) {
+            Self::ZeroOne(Some(one)) if f(&one) => Self::ZeroOne(Some(one)),
+            Self::ZeroOne(_) => Self::ZeroOne(None),
+            Self::Multi(slice) => {
+                let mut vec = slice.into_vec();
+                vec.retain(f);
+                // `From<Vec<T>>` picks the variant matching the remaining length.
+                Self::from(vec)
+            }
+        };
+    }
+}
+
+/// Views the contents as a plain slice, whichever variant is in use.
+impl<T> Deref for ShortSlice<T> {
+    type Target = [T];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            ShortSlice::ZeroOne(None) => &[],
+            ShortSlice::ZeroOne(Some(v)) => core::slice::from_ref(v),
+            ShortSlice::Multi(v) => v,
+        }
+    }
+}
+
+/// Views the contents as a mutable slice, whichever variant is in use.
+impl<T> DerefMut for ShortSlice<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            ShortSlice::ZeroOne(None) => &mut [],
+            ShortSlice::ZeroOne(Some(v)) => core::slice::from_mut(v),
+            ShortSlice::Multi(v) => v,
+        }
+    }
+}
+
+/// Converts a `Vec`, storing zero or one element inline and boxing anything
+/// longer.
+impl<T> From<Vec<T>> for ShortSlice<T> {
+    fn from(v: Vec<T>) -> Self {
+        match v.len() {
+            0 => ShortSlice::ZeroOne(None),
+            #[allow(clippy::unwrap_used)] // we know that the vec is not empty
+            1 => ShortSlice::ZeroOne(Some(v.into_iter().next().unwrap())),
+            _ => ShortSlice::Multi(v.into_boxed_slice()),
+        }
+    }
+}
+
+/// The default `ShortSlice` is empty.
+impl<T> Default for ShortSlice<T> {
+    fn default() -> Self {
+        ShortSlice::ZeroOne(None)
+    }
+}
+
+/// Collects an iterator, only building a `Vec` when it yields at least two
+/// elements.
+impl<T> FromIterator<T> for ShortSlice<T> {
+    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
+        let mut iter = iter.into_iter();
+        // Pull the first two items up front to decide which variant is needed.
+        match (iter.next(), iter.next()) {
+            (Some(first), Some(second)) => {
+                // Size hint behaviour same as `Vec::extend` + 2
+                let mut vec = Vec::with_capacity(iter.size_hint().0.saturating_add(3));
+                vec.push(first);
+                vec.push(second);
+                vec.extend(iter);
+                Self::Multi(vec.into_boxed_slice())
+            }
+            // Zero or one item: `first` is `None` or `Some(item)` respectively.
+            (first, _) => Self::ZeroOne(first),
+        }
+    }
+}
+
+impl<K, V> StoreConstEmpty<K, V> for ShortSlice<(K, V)> {
+    const EMPTY: ShortSlice<(K, V)> = ShortSlice::ZeroOne(None);
+}
+
+// Read-only `litemap` store operations, implemented on top of `ShortSlice`'s
+// own methods and the slice it derefs to.
+impl<K, V> Store<K, V> for ShortSlice<(K, V)> {
+    #[inline]
+    fn lm_len(&self) -> usize {
+        self.len()
+    }
+
+    #[inline]
+    fn lm_is_empty(&self) -> bool {
+        matches!(self, ShortSlice::ZeroOne(None))
+    }
+
+    #[inline]
+    fn lm_get(&self, index: usize) -> Option<(&K, &V)> {
+        self.get(index).map(|elt| (&elt.0, &elt.1))
+    }
+
+    #[inline]
+    fn lm_last(&self) -> Option<(&K, &V)> {
+        match self {
+            ShortSlice::ZeroOne(v) => v.as_ref(),
+            ShortSlice::Multi(v) => v.last(),
+        }
+        .map(|elt| (&elt.0, &elt.1))
+    }
+
+    #[inline]
+    fn lm_binary_search_by<F>(&self, mut cmp: F) -> Result<usize, usize>
+    where
+        F: FnMut(&K) -> core::cmp::Ordering,
+    {
+        // Only the key half of each pair takes part in the comparison.
+        self.binary_search_by(|(k, _)| cmp(k))
+    }
+}
+
+impl<K: Ord, V> StoreFromIterable<K, V> for ShortSlice<(K, V)> {
+    fn lm_sort_from_iter<I: IntoIterator<Item = (K, V)>>(iter: I) -> Self {
+        // Reuse the `Vec` store implementation, then convert the result.
+        let v: Vec<(K, V)> = Vec::lm_sort_from_iter(iter);
+        v.into()
+    }
+}
+
+impl<K, V> StoreMut<K, V> for ShortSlice<(K, V)> {
+    fn lm_with_capacity(_capacity: usize) -> Self {
+        ShortSlice::ZeroOne(None)
+    }
+
+    fn lm_reserve(&mut self, _additional: usize) {}
+
+    fn lm_get_mut(&mut self, index: usize) -> Option<(&K, &mut V)> {
+        self.get_mut(index).map(|elt| (&elt.0, &mut elt.1))
+    }
+
+    fn lm_push(&mut self, key: K, value: V) {
+        self.push((key, value))
+    }
+
+    fn lm_insert(&mut self, index: usize, key: K, value: V) {
+        self.insert(index, (key, value))
+    }
+
+    fn lm_remove(&mut self, index: usize) -> (K, V) {
+        self.remove(index)
+    }
+
+    fn lm_clear(&mut 
self) { + self.clear(); + } + + fn lm_retain<F>(&mut self, mut predicate: F) + where + F: FnMut(&K, &V) -> bool, + { + self.retain(|(k, v)| predicate(k, v)) + } +} + +impl<'a, K: 'a, V: 'a> StoreIterable<'a, K, V> for ShortSlice<(K, V)> { + type KeyValueIter = + core::iter::Map<core::slice::Iter<'a, (K, V)>, for<'r> fn(&'r (K, V)) -> (&'r K, &'r V)>; + + fn lm_iter(&'a self) -> Self::KeyValueIter { + self.iter().map(|elt| (&elt.0, &elt.1)) + } +} + +impl<K, V> StoreFromIterator<K, V> for ShortSlice<(K, V)> {} + +#[test] +fn test_short_slice_impl() { + litemap::testing::check_store::<ShortSlice<(u32, u64)>>(); +} + +macro_rules! impl_tinystr_subtag { + ( + $(#[$doc:meta])* + $name:ident, + $($path:ident)::+, + $macro_name:ident, + $legacy_macro_name:ident, + $len_start:literal..=$len_end:literal, + $tinystr_ident:ident, + $validate:expr, + $normalize:expr, + $is_normalized:expr, + $error:ident, + [$good_example:literal $(,$more_good_examples:literal)*], + [$bad_example:literal $(, $more_bad_examples:literal)*], + ) => { + #[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)] + #[cfg_attr(feature = "serde", derive(serde::Serialize))] + #[repr(transparent)] + $(#[$doc])* + pub struct $name(tinystr::TinyAsciiStr<$len_end>); + + impl $name { + /// A constructor which takes a UTF-8 slice, parses it and + #[doc = concat!("produces a well-formed [`", stringify!($name), "`].")] + /// + /// # Examples + /// + /// ``` + #[doc = concat!("use icu_locid::", stringify!($($path::)+), stringify!($name), ";")] + /// + #[doc = concat!("assert!(", stringify!($name), "::try_from_bytes(b", stringify!($good_example), ").is_ok());")] + #[doc = concat!("assert!(", stringify!($name), "::try_from_bytes(b", stringify!($bad_example), ").is_err());")] + /// ``` + pub const fn try_from_bytes(v: &[u8]) -> Result<Self, crate::parser::errors::ParserError> { + Self::try_from_bytes_manual_slice(v, 0, v.len()) + } + + /// Equivalent to 
[`try_from_bytes(bytes[start..end])`](Self::try_from_bytes), + /// but callable in a `const` context (which range indexing is not). + pub const fn try_from_bytes_manual_slice( + v: &[u8], + start: usize, + end: usize, + ) -> Result<Self, crate::parser::errors::ParserError> { + let slen = end - start; + + #[allow(clippy::double_comparisons)] // if len_start == len_end + if slen < $len_start || slen > $len_end { + return Err(crate::parser::errors::ParserError::$error); + } + + match tinystr::TinyAsciiStr::from_bytes_manual_slice(v, start, end) { + Ok($tinystr_ident) if $validate => Ok(Self($normalize)), + _ => Err(crate::parser::errors::ParserError::$error), + } + } + + #[doc = concat!("Safely creates a [`", stringify!($name), "`] from its raw format")] + /// as returned by [`Self::into_raw`]. Unlike [`Self::try_from_bytes`], + /// this constructor only takes normalized values. + pub const fn try_from_raw( + v: [u8; $len_end], + ) -> Result<Self, crate::parser::errors::ParserError> { + if let Ok($tinystr_ident) = tinystr::TinyAsciiStr::<$len_end>::try_from_raw(v) { + if $tinystr_ident.len() >= $len_start && $is_normalized { + Ok(Self($tinystr_ident)) + } else { + Err(crate::parser::errors::ParserError::$error) + } + } else { + Err(crate::parser::errors::ParserError::$error) + } + } + + #[doc = concat!("Unsafely creates a [`", stringify!($name), "`] from its raw format")] + /// as returned by [`Self::into_raw`]. Unlike [`Self::try_from_bytes`], + /// this constructor only takes normalized values. + /// + /// # Safety + /// + /// This function is safe iff [`Self::try_from_raw`] returns an `Ok`. This is the case + /// for inputs that are correctly normalized. + pub const unsafe fn from_raw_unchecked(v: [u8; $len_end]) -> Self { + Self(tinystr::TinyAsciiStr::from_bytes_unchecked(v)) + } + + /// Deconstructs into a raw format to be consumed by + /// [`from_raw_unchecked`](Self::from_raw_unchecked()) or + /// [`try_from_raw`](Self::try_from_raw()). 
+ pub const fn into_raw(self) -> [u8; $len_end] { + *self.0.all_bytes() + } + + #[inline] + /// A helper function for displaying as a `&str`. + pub const fn as_str(&self) -> &str { + self.0.as_str() + } + + #[doc(hidden)] + pub const fn into_tinystr(&self) -> tinystr::TinyAsciiStr<$len_end> { + self.0 + } + + /// Compare with BCP-47 bytes. + /// + /// The return value is equivalent to what would happen if you first converted + /// `self` to a BCP-47 string and then performed a byte comparison. + /// + /// This function is case-sensitive and results in a *total order*, so it is appropriate for + /// binary search. The only argument producing [`Ordering::Equal`](core::cmp::Ordering::Equal) + /// is `self.as_str().as_bytes()`. + #[inline] + pub fn strict_cmp(self, other: &[u8]) -> core::cmp::Ordering { + self.as_str().as_bytes().cmp(other) + } + + /// Compare with a potentially unnormalized BCP-47 string. + /// + /// The return value is equivalent to what would happen if you first parsed the + /// BCP-47 string and then performed a structural comparison. 
+ /// + #[inline] + pub fn normalizing_eq(self, other: &str) -> bool { + self.as_str().eq_ignore_ascii_case(other) + } + } + + impl core::str::FromStr for $name { + type Err = crate::parser::errors::ParserError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::try_from_bytes(source.as_bytes()) + } + } + + impl<'l> From<&'l $name> for &'l str { + fn from(input: &'l $name) -> Self { + input.as_str() + } + } + + impl From<$name> for tinystr::TinyAsciiStr<$len_end> { + fn from(input: $name) -> Self { + input.into_tinystr() + } + } + + impl writeable::Writeable for $name { + #[inline] + fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result { + sink.write_str(self.as_str()) + } + #[inline] + fn writeable_length_hint(&self) -> writeable::LengthHint { + writeable::LengthHint::exact(self.0.len()) + } + #[inline] + fn write_to_string(&self) -> alloc::borrow::Cow<str> { + alloc::borrow::Cow::Borrowed(self.0.as_str()) + } + } + + writeable::impl_display_with_writeable!($name); + + #[doc = concat!("A macro allowing for compile-time construction of valid [`", stringify!($name), "`] subtags.")] + /// + /// # Examples + /// + /// Parsing errors don't have to be handled at runtime: + /// ``` + /// assert_eq!( + #[doc = concat!(" icu_locid::", $(stringify!($path), "::",)+ stringify!($macro_name), "!(", stringify!($good_example) ,"),")] + #[doc = concat!(" ", stringify!($good_example), ".parse::<icu_locid::", $(stringify!($path), "::",)+ stringify!($name), ">().unwrap()")] + /// ); + /// ``` + /// + /// Invalid input is a compile failure: + /// ```compile_fail,E0080 + #[doc = concat!("icu_locid::", $(stringify!($path), "::",)+ stringify!($macro_name), "!(", stringify!($bad_example) ,");")] + /// ``` + /// + #[doc = concat!("[`", stringify!($name), "`]: crate::", $(stringify!($path), "::",)+ stringify!($name))] + #[macro_export] + #[doc(hidden)] + macro_rules! 
$legacy_macro_name { + ($string:literal) => {{ + use $crate::$($path ::)+ $name; + const R: $name = + match $name::try_from_bytes($string.as_bytes()) { + Ok(r) => r, + #[allow(clippy::panic)] // const context + _ => panic!(concat!("Invalid ", $(stringify!($path), "::",)+ stringify!($name), ": ", $string)), + }; + R + }}; + } + #[doc(inline)] + pub use $legacy_macro_name as $macro_name; + + #[cfg(feature = "databake")] + impl databake::Bake for $name { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + env.insert("icu_locid"); + let string = self.as_str(); + databake::quote! { icu_locid::$($path::)+ $macro_name!(#string) } + } + } + + #[test] + fn test_construction() { + let maybe = $name::try_from_bytes($good_example.as_bytes()); + assert!(maybe.is_ok()); + assert_eq!(maybe, $name::try_from_raw(maybe.unwrap().into_raw())); + assert_eq!(maybe.unwrap().as_str(), $good_example); + $( + let maybe = $name::try_from_bytes($more_good_examples.as_bytes()); + assert!(maybe.is_ok()); + assert_eq!(maybe, $name::try_from_raw(maybe.unwrap().into_raw())); + assert_eq!(maybe.unwrap().as_str(), $more_good_examples); + )* + assert!($name::try_from_bytes($bad_example.as_bytes()).is_err()); + $( + assert!($name::try_from_bytes($more_bad_examples.as_bytes()).is_err()); + )* + } + + #[test] + fn test_writeable() { + writeable::assert_writeable_eq!(&$good_example.parse::<$name>().unwrap(), $good_example); + $( + writeable::assert_writeable_eq!($more_good_examples.parse::<$name>().unwrap(), $more_good_examples); + )* + } + + #[cfg(feature = "serde")] + impl<'de> serde::Deserialize<'de> for $name { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::de::Deserializer<'de>, + { + struct Visitor; + + impl<'de> serde::de::Visitor<'de> for Visitor { + type Value = $name; + + fn expecting( + &self, + formatter: &mut core::fmt::Formatter<'_>, + ) -> core::fmt::Result { + write!(formatter, "a valid BCP-47 {}", stringify!($name)) + } + + fn 
visit_str<E: serde::de::Error>(self, s: &str) -> Result<Self::Value, E> { + s.parse().map_err(serde::de::Error::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_string(Visitor) + } else { + Self::try_from_raw(serde::de::Deserialize::deserialize(deserializer)?) + .map_err(serde::de::Error::custom) + } + } + } + + // Safety checklist for ULE: + // + // 1. Must not include any uninitialized or padding bytes (true since transparent over a ULE). + // 2. Must have an alignment of 1 byte (true since transparent over a ULE). + // 3. ULE::validate_byte_slice() checks that the given byte slice represents a valid slice. + // 4. ULE::validate_byte_slice() checks that the given byte slice has a valid length. + // 5. All other methods must be left with their default impl. + // 6. Byte equality is semantic equality. + #[cfg(feature = "zerovec")] + unsafe impl zerovec::ule::ULE for $name { + fn validate_byte_slice(bytes: &[u8]) -> Result<(), zerovec::ZeroVecError> { + let it = bytes.chunks_exact(core::mem::size_of::<Self>()); + if !it.remainder().is_empty() { + return Err(zerovec::ZeroVecError::length::<Self>(bytes.len())); + } + for v in it { + // The following can be removed once `array_chunks` is stabilized. + let mut a = [0; core::mem::size_of::<Self>()]; + a.copy_from_slice(v); + if Self::try_from_raw(a).is_err() { + return Err(zerovec::ZeroVecError::parse::<Self>()); + } + } + Ok(()) + } + } + + #[cfg(feature = "zerovec")] + impl zerovec::ule::AsULE for $name { + type ULE = Self; + fn to_unaligned(self) -> Self::ULE { + self + } + fn from_unaligned(unaligned: Self::ULE) -> Self { + unaligned + } + } + + #[cfg(feature = "zerovec")] + impl<'a> zerovec::maps::ZeroMapKV<'a> for $name { + type Container = zerovec::ZeroVec<'a, $name>; + type Slice = zerovec::ZeroSlice<$name>; + type GetType = $name; + type OwnedType = $name; + } + }; +} + +macro_rules! 
impl_writeable_for_each_subtag_str_no_test { + ($type:tt $(, $self:ident, $borrow_cond:expr => $borrow:expr)?) => { + impl writeable::Writeable for $type { + fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result { + let mut initial = true; + self.for_each_subtag_str(&mut |subtag| { + if initial { + initial = false; + } else { + sink.write_char('-')?; + } + sink.write_str(subtag) + }) + } + + #[inline] + fn writeable_length_hint(&self) -> writeable::LengthHint { + let mut result = writeable::LengthHint::exact(0); + let mut initial = true; + self.for_each_subtag_str::<core::convert::Infallible, _>(&mut |subtag| { + if initial { + initial = false; + } else { + result += 1; + } + result += subtag.len(); + Ok(()) + }) + .expect("infallible"); + result + } + + $( + fn write_to_string(&self) -> alloc::borrow::Cow<str> { + #[allow(clippy::unwrap_used)] // impl_writeable_for_subtag_list's $borrow uses unwrap + let $self = self; + if $borrow_cond { + $borrow + } else { + let mut output = alloc::string::String::with_capacity(self.writeable_length_hint().capacity()); + let _ = self.write_to(&mut output); + alloc::borrow::Cow::Owned(output) + } + } + )? + } + + writeable::impl_display_with_writeable!($type); + }; +} + +macro_rules! impl_writeable_for_subtag_list { + ($type:tt, $sample1:literal, $sample2:literal) => { + impl_writeable_for_each_subtag_str_no_test!($type, selff, selff.0.len() == 1 => alloc::borrow::Cow::Borrowed(selff.0.get(0).unwrap().as_str())); + + #[test] + fn test_writeable() { + writeable::assert_writeable_eq!(&$type::default(), ""); + writeable::assert_writeable_eq!( + &$type::from_short_slice_unchecked(alloc::vec![$sample1.parse().unwrap()].into()), + $sample1, + ); + writeable::assert_writeable_eq!( + &$type::from_short_slice_unchecked(vec![ + $sample1.parse().unwrap(), + $sample2.parse().unwrap() + ].into()), + core::concat!($sample1, "-", $sample2), + ); + } + }; +} + +macro_rules! 
impl_writeable_for_key_value { + ($type:tt, $key1:literal, $value1:literal, $key2:literal, $expected2:literal) => { + impl_writeable_for_each_subtag_str_no_test!($type); + + #[test] + fn test_writeable() { + writeable::assert_writeable_eq!(&$type::default(), ""); + writeable::assert_writeable_eq!( + &$type::from_tuple_vec(vec![($key1.parse().unwrap(), $value1.parse().unwrap())]), + core::concat!($key1, "-", $value1), + ); + writeable::assert_writeable_eq!( + &$type::from_tuple_vec(vec![ + ($key1.parse().unwrap(), $value1.parse().unwrap()), + ($key2.parse().unwrap(), "true".parse().unwrap()) + ]), + core::concat!($key1, "-", $value1, "-", $expected2), + ); + } + }; +} diff --git a/third_party/rust/icu_locid/src/langid.rs b/third_party/rust/icu_locid/src/langid.rs new file mode 100644 index 0000000000..eac8c83713 --- /dev/null +++ b/third_party/rust/icu_locid/src/langid.rs @@ -0,0 +1,574 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::cmp::Ordering; +use core::str::FromStr; + +use crate::ordering::SubtagOrderingResult; +use crate::parser::{ + parse_language_identifier, parse_language_identifier_with_single_variant, ParserError, + ParserMode, SubtagIterator, +}; +use crate::subtags; +use alloc::string::String; +use writeable::Writeable; + +/// A core struct representing a [`Unicode BCP47 Language Identifier`]. 
+/// +/// # Examples +/// +/// ``` +/// use icu::locid::{ +/// langid, +/// subtags::{language, region}, +/// }; +/// +/// let li = langid!("en-US"); +/// +/// assert_eq!(li.language, language!("en")); +/// assert_eq!(li.script, None); +/// assert_eq!(li.region, Some(region!("US"))); +/// assert_eq!(li.variants.len(), 0); +/// ``` +/// +/// # Parsing +/// +/// Unicode recognizes three levels of standard conformance for any language identifier: +/// +/// * *well-formed* - syntactically correct +/// * *valid* - well-formed and only uses registered language, region, script and variant subtags... +/// * *canonical* - valid and no deprecated codes or structure. +/// +/// At the moment parsing normalizes a well-formed language identifier converting +/// `_` separators to `-` and adjusting casing to conform to the Unicode standard. +/// +/// Any bogus subtags will cause the parsing to fail with an error. +/// No subtag validation is performed. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::{ +/// langid, +/// subtags::{language, region, script, variant}, +/// }; +/// +/// let li = langid!("eN_latn_Us-Valencia"); +/// +/// assert_eq!(li.language, language!("en")); +/// assert_eq!(li.script, Some(script!("Latn"))); +/// assert_eq!(li.region, Some(region!("US"))); +/// assert_eq!(li.variants.get(0), Some(&variant!("valencia"))); +/// ``` +/// +/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier +#[derive(Default, PartialEq, Eq, Clone, Hash)] +#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) +pub struct LanguageIdentifier { + /// Language subtag of the language identifier. + pub language: subtags::Language, + /// Script subtag of the language identifier. + pub script: Option<subtags::Script>, + /// Region subtag of the language identifier. + pub region: Option<subtags::Region>, + /// Variant subtags of the language identifier. 
+ pub variants: subtags::Variants, +} + +impl LanguageIdentifier { + /// A constructor which takes a utf8 slice, parses it and + /// produces a well-formed [`LanguageIdentifier`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::LanguageIdentifier; + /// + /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed"); + /// ``` + pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { + parse_language_identifier(v, ParserMode::LanguageIdentifier) + } + + #[doc(hidden)] + #[allow(clippy::type_complexity)] + // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops` + // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)). + pub const fn try_from_bytes_with_single_variant( + v: &[u8], + ) -> Result< + ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Option<subtags::Variant>, + ), + ParserError, + > { + parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier) + } + + /// A constructor which takes a utf8 slice which may contain extension keys, + /// parses it and produces a well-formed [`LanguageIdentifier`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::{langid, LanguageIdentifier}; + /// + /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix") + /// .expect("Parsing failed."); + /// + /// assert_eq!(li, langid!("en-US")); + /// ``` + /// + /// This method should be used for input that may be a locale identifier. + /// All extensions will be lost. + pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> { + parse_language_identifier(v, ParserMode::Locale) + } + + /// The default undefined language "und". Same as [`default()`](Default::default()). 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::LanguageIdentifier; + /// + /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND); + /// ``` + pub const UND: Self = Self { + language: subtags::Language::UND, + script: None, + region: None, + variants: subtags::Variants::new(), + }; + + /// This is a best-effort operation that performs all available levels of canonicalization. + /// + /// At the moment the operation will normalize casing and the separator, but in the future + /// it may also validate and update from deprecated subtags to canonical ones. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::LanguageIdentifier; + /// + /// assert_eq!( + /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(), + /// Ok("pl-Latn-PL") + /// ); + /// ``` + pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { + let lang_id = Self::try_from_bytes(input.as_ref())?; + Ok(lang_id.write_to_string().into_owned()) + } + + /// Compare this [`LanguageIdentifier`] with BCP-47 bytes. + /// + /// The return value is equivalent to what would happen if you first converted this + /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison. + /// + /// This function is case-sensitive and results in a *total order*, so it is appropriate for + /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::LanguageIdentifier; + /// use std::cmp::Ordering; + /// + /// let bcp47_strings: &[&str] = &[ + /// "pl-Latn-PL", + /// "und", + /// "und-Adlm", + /// "und-GB", + /// "und-ZA", + /// "und-fonipa", + /// "zh", + /// ]; + /// + /// for ab in bcp47_strings.windows(2) { + /// let a = ab[0]; + /// let b = ab[1]; + /// assert!(a.cmp(b) == Ordering::Less); + /// let a_langid = a.parse::<LanguageIdentifier>().unwrap(); + /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal); + /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less); + /// } + /// ``` + pub fn strict_cmp(&self, other: &[u8]) -> Ordering { + self.strict_cmp_iter(other.split(|b| *b == b'-')).end() + } + + /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags. + /// + /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as + /// a more modular version that allows multiple subtag iterators to be chained together. + /// + /// For an additional example, see [`SubtagOrderingResult`]. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::LanguageIdentifier; + /// use std::cmp::Ordering; + /// + /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"]; + /// + /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap(); + /// assert_eq!( + /// Ordering::Equal, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// + /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap(); + /// assert_eq!( + /// Ordering::Less, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// + /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap(); + /// assert_eq!( + /// Ordering::Greater, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// ``` + pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> + where + I: Iterator<Item = &'l [u8]>, + { + let r = self.for_each_subtag_str(&mut |subtag| { + if let Some(other) = subtags.next() { + match subtag.as_bytes().cmp(other) { + Ordering::Equal => Ok(()), + not_equal => Err(not_equal), + } + } else { + Err(Ordering::Greater) + } + }); + match r { + Ok(_) => SubtagOrderingResult::Subtags(subtags), + Err(o) => SubtagOrderingResult::Ordering(o), + } + } + + /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string. + /// + /// The return value is equivalent to what would happen if you first parsed the + /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::LanguageIdentifier; + /// use std::cmp::Ordering; + /// + /// let bcp47_strings: &[&str] = &[ + /// "pl-LaTn-pL", + /// "uNd", + /// "UnD-adlm", + /// "uNd-GB", + /// "UND-FONIPA", + /// "ZH", + /// ]; + /// + /// for a in bcp47_strings { + /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a)); + /// } + /// ``` + pub fn normalizing_eq(&self, other: &str) -> bool { + macro_rules! 
subtag_matches { + ($T:ty, $iter:ident, $expected:expr) => { + $iter + .next() + .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) + .unwrap_or(false) + }; + } + + let mut iter = SubtagIterator::new(other.as_bytes()); + if !subtag_matches!(subtags::Language, iter, self.language) { + return false; + } + if let Some(ref script) = self.script { + if !subtag_matches!(subtags::Script, iter, *script) { + return false; + } + } + if let Some(ref region) = self.region { + if !subtag_matches!(subtags::Region, iter, *region) { + return false; + } + } + for variant in self.variants.iter() { + if !subtag_matches!(subtags::Variant, iter, *variant) { + return false; + } + } + iter.next().is_none() + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + f(self.language.as_str())?; + if let Some(ref script) = self.script { + f(script.as_str())?; + } + if let Some(ref region) = self.region { + f(region.as_str())?; + } + for variant in self.variants.iter() { + f(variant.as_str())?; + } + Ok(()) + } + + /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in + /// lowercase ascii form. + /// + /// The default canonicalization of language identifiers uses titlecase scripts and uppercase + /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies: + /// + /// > _The canonical form for all subtags in the extension is lowercase, with the fields + /// ordered by the separators, alphabetically._ + /// + /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct + /// canonicalization of the language identifier. + /// + /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is + /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions, + /// but titlecased and uppercased outside T extensions respectively. 
+ /// + /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt + /// [`Transform extensions`]: crate::extensions::transform + pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + f(self.language.as_str())?; + if let Some(ref script) = self.script { + f(script.into_tinystr().to_ascii_lowercase().as_str())?; + } + if let Some(ref region) = self.region { + f(region.into_tinystr().to_ascii_lowercase().as_str())?; + } + for variant in self.variants.iter() { + f(variant.as_str())?; + } + Ok(()) + } + + /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with + /// lowercase ascii chars. + /// + /// The default canonicalization of language identifiers uses titlecase scripts and uppercase + /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies: + /// + /// > _The canonical form for all subtags in the extension is lowercase, with the fields + /// ordered by the separators, alphabetically._ + /// + /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct + /// canonicalization of the language identifier. + /// + /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is + /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions, + /// but titlecased and uppercased outside T extensions respectively. 
+ /// + /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt + /// [`Transform extensions`]: crate::extensions::transform + pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>( + &self, + sink: &mut W, + ) -> core::fmt::Result { + let mut initial = true; + self.for_each_subtag_str_lowercased(&mut |subtag| { + if initial { + initial = false; + } else { + sink.write_char('-')?; + } + sink.write_str(subtag) + }) + } +} + +impl AsRef<LanguageIdentifier> for LanguageIdentifier { + fn as_ref(&self) -> &Self { + self + } +} + +impl AsMut<LanguageIdentifier> for LanguageIdentifier { + fn as_mut(&mut self) -> &mut Self { + self + } +} + +impl core::fmt::Debug for LanguageIdentifier { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + core::fmt::Display::fmt(&self, f) + } +} + +impl FromStr for LanguageIdentifier { + type Err = ParserError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::try_from_bytes(source.as_bytes()) + } +} + +impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string()); + +#[test] +fn test_writeable() { + use writeable::assert_writeable_eq; + assert_writeable_eq!(LanguageIdentifier::UND, "und"); + assert_writeable_eq!("und-001".parse::<LanguageIdentifier>().unwrap(), "und-001"); + assert_writeable_eq!( + "und-Mymr".parse::<LanguageIdentifier>().unwrap(), + "und-Mymr", + ); + assert_writeable_eq!( + "my-Mymr-MM".parse::<LanguageIdentifier>().unwrap(), + "my-Mymr-MM", + ); + assert_writeable_eq!( + "my-Mymr-MM-posix".parse::<LanguageIdentifier>().unwrap(), + "my-Mymr-MM-posix", + ); + assert_writeable_eq!( + "zh-macos-posix".parse::<LanguageIdentifier>().unwrap(), + "zh-macos-posix", + ); +} + +/// # Examples +/// +/// ``` +/// use icu::locid::{langid, subtags::language, LanguageIdentifier}; +/// +/// assert_eq!(LanguageIdentifier::from(language!("en")), 
langid!("en")); +/// ``` +impl From<subtags::Language> for LanguageIdentifier { + fn from(language: subtags::Language) -> Self { + Self { + language, + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::{langid, subtags::script, LanguageIdentifier}; +/// +/// assert_eq!( +/// LanguageIdentifier::from(Some(script!("latn"))), +/// langid!("und-Latn") +/// ); +/// ``` +impl From<Option<subtags::Script>> for LanguageIdentifier { + fn from(script: Option<subtags::Script>) -> Self { + Self { + script, + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::{langid, subtags::region, LanguageIdentifier}; +/// +/// assert_eq!( +/// LanguageIdentifier::from(Some(region!("US"))), +/// langid!("und-US") +/// ); +/// ``` +impl From<Option<subtags::Region>> for LanguageIdentifier { + fn from(region: Option<subtags::Region>) -> Self { + Self { + region, + ..Default::default() + } + } +} + +/// Convert from an LSR tuple to a [`LanguageIdentifier`]. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::{ +/// langid, +/// subtags::{language, region, script}, +/// LanguageIdentifier, +/// }; +/// +/// let lang = language!("en"); +/// let script = script!("Latn"); +/// let region = region!("US"); +/// assert_eq!( +/// LanguageIdentifier::from((lang, Some(script), Some(region))), +/// langid!("en-Latn-US") +/// ); +/// ``` +impl + From<( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + )> for LanguageIdentifier +{ + fn from( + lsr: ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + ), + ) -> Self { + Self { + language: lsr.0, + script: lsr.1, + region: lsr.2, + ..Default::default() + } + } +} + +/// Convert from a [`LanguageIdentifier`] to an LSR tuple. 
+/// +/// # Examples +/// +/// ``` +/// use icu::locid::{ +/// langid, +/// subtags::{language, region, script}, +/// }; +/// +/// let lid = langid!("en-Latn-US"); +/// let (lang, script, region) = (&lid).into(); +/// +/// assert_eq!(lang, language!("en")); +/// assert_eq!(script, Some(script!("Latn"))); +/// assert_eq!(region, Some(region!("US"))); +/// ``` +impl From<&LanguageIdentifier> + for ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + ) +{ + fn from(langid: &LanguageIdentifier) -> Self { + (langid.language, langid.script, langid.region) + } +} diff --git a/third_party/rust/icu_locid/src/lib.rs b/third_party/rust/icu_locid/src/lib.rs new file mode 100644 index 0000000000..9c6c46ca51 --- /dev/null +++ b/third_party/rust/icu_locid/src/lib.rs @@ -0,0 +1,93 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Parsing, manipulating, and serializing Unicode Language and Locale Identifiers. +//! +//! This module is published as its own crate ([`icu_locid`](https://docs.rs/icu_locid/latest/icu_locid/)) +//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. +//! +//! The module provides algorithms for parsing a string into a well-formed language or locale identifier +//! as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]. +//! +//! [`Locale`] is the most common structure to use for storing information about a language, +//! script, region, variants and extensions. In almost all cases, this struct should be used as the +//! base unit for all locale management operations. +//! +//! [`LanguageIdentifier`] is a strict subset of [`Locale`] which can be useful in a narrow range of +//! cases where [`Unicode Extensions`] are not relevant. +//! +//! If in doubt, use [`Locale`]. +//! 
+//! # Examples +//! +//! ``` +//! use icu::locid::Locale; +//! use icu::locid::{ +//! locale, +//! subtags::{language, region}, +//! }; +//! +//! let mut loc: Locale = locale!("en-US"); +//! +//! assert_eq!(loc.id.language, language!("en")); +//! assert_eq!(loc.id.script, None); +//! assert_eq!(loc.id.region, Some(region!("US"))); +//! assert_eq!(loc.id.variants.len(), 0); +//! +//! loc.id.region = Some(region!("GB")); +//! +//! assert_eq!(loc, locale!("en-GB")); +//! ``` +//! +//! For more details, see [`Locale`] and [`LanguageIdentifier`]. +//! +//! [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]: https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers +//! [`ICU4X`]: ../icu/index.html +//! [`Unicode Extensions`]: extensions + +// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations +#![cfg_attr(not(any(test, feature = "std")), no_std)] +#![cfg_attr( + not(test), + deny( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::exhaustive_structs, + clippy::exhaustive_enums, + missing_debug_implementations, + ) +)] +#![warn(missing_docs)] + +extern crate alloc; + +#[macro_use] +mod helpers; + +mod langid; +mod locale; +mod macros; +mod ordering; +mod parser; + +pub use langid::LanguageIdentifier; +pub use locale::Locale; +pub use ordering::SubtagOrderingResult; +pub use parser::errors::ParserError; + +#[doc(no_inline)] +pub use ParserError as Error; + +pub mod extensions; +#[macro_use] +pub mod subtags; +pub mod zerovec; + +#[cfg(feature = "serde")] +mod serde; + +#[cfg(feature = "databake")] +mod databake; diff --git a/third_party/rust/icu_locid/src/locale.rs b/third_party/rust/icu_locid/src/locale.rs new file mode 100644 index 0000000000..e87cdf1a20 --- /dev/null +++ b/third_party/rust/icu_locid/src/locale.rs @@ -0,0 +1,511 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::ordering::SubtagOrderingResult; +use crate::parser::{ + parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, + ParserError, ParserMode, SubtagIterator, +}; +use crate::{extensions, subtags, LanguageIdentifier}; +use alloc::string::String; +use core::cmp::Ordering; +use core::str::FromStr; +use tinystr::TinyAsciiStr; +use writeable::Writeable; + +/// A core struct representing a [`Unicode Locale Identifier`]. +/// +/// A locale is made of two parts: +/// * Unicode Language Identifier +/// * A set of Unicode Extensions +/// +/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and +/// on top of that is able to parse, manipulate and serialize unicode extension fields. +/// +/// +/// # Examples +/// +/// ``` +/// use icu_locid::{ +/// extensions::unicode::{key, value}, +/// locale, +/// subtags::{language, region}, +/// }; +/// +/// let loc = locale!("en-US-u-ca-buddhist"); +/// +/// assert_eq!(loc.id.language, language!("en")); +/// assert_eq!(loc.id.script, None); +/// assert_eq!(loc.id.region, Some(region!("US"))); +/// assert_eq!(loc.id.variants.len(), 0); +/// assert_eq!( +/// loc.extensions.unicode.keywords.get(&key!("ca")), +/// Some(&value!("buddhist")) +/// ); +/// ``` +/// +/// # Parsing +/// +/// Unicode recognizes three levels of standard conformance for a locale: +/// +/// * *well-formed* - syntactically correct +/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... +/// * *canonical* - valid and no deprecated codes or structure. +/// +/// At the moment parsing normalizes a well-formed locale identifier converting +/// `_` separators to `-` and adjusting casing to conform to the Unicode standard. 
+/// +/// Any bogus subtags will cause the parsing to fail with an error. +/// +/// No subtag validation or alias resolution is performed. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::{subtags::*, Locale}; +/// +/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12" +/// .parse() +/// .expect("Failed to parse."); +/// +/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap()); +/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok()); +/// assert_eq!(loc.id.region, "US".parse::<Region>().ok()); +/// assert_eq!( +/// loc.id.variants.get(0), +/// "valencia".parse::<Variant>().ok().as_ref() +/// ); +/// ``` +/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier +#[derive(Default, PartialEq, Eq, Clone, Hash)] +#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) +pub struct Locale { + /// The basic language/script/region components in the locale identifier along with any variants. + pub id: LanguageIdentifier, + /// Any extensions present in the locale identifier. 
+ pub extensions: extensions::Extensions, +} + +// The expected sizes below hold only with 8-byte pointers (e.g. a `Vec` is asserted to be +// 24 bytes), so this test is built for 64-bit targets only. +#[test] +#[cfg(target_pointer_width = "64")] +fn test_sizes() { + assert_eq!(core::mem::size_of::<subtags::Language>(), 3); + assert_eq!(core::mem::size_of::<subtags::Script>(), 4); + assert_eq!(core::mem::size_of::<subtags::Region>(), 3); + assert_eq!(core::mem::size_of::<subtags::Variant>(), 8); + assert_eq!(core::mem::size_of::<subtags::Variants>(), 16); + assert_eq!(core::mem::size_of::<LanguageIdentifier>(), 32); + + assert_eq!(core::mem::size_of::<extensions::transform::Transform>(), 56); + assert_eq!(core::mem::size_of::<Option<LanguageIdentifier>>(), 32); + assert_eq!(core::mem::size_of::<extensions::transform::Fields>(), 24); + + assert_eq!(core::mem::size_of::<extensions::unicode::Attributes>(), 16); + assert_eq!(core::mem::size_of::<extensions::unicode::Keywords>(), 24); + assert_eq!(core::mem::size_of::<Vec<extensions::other::Other>>(), 24); + assert_eq!(core::mem::size_of::<extensions::private::Private>(), 16); + assert_eq!(core::mem::size_of::<extensions::Extensions>(), 136); + + assert_eq!(core::mem::size_of::<Locale>(), 168); +} + +impl Locale { + /// A constructor which takes a utf8 slice, parses it and + /// produces a well-formed [`Locale`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap(); + /// ``` + pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { + parse_locale(v) + } + + /// The default undefined locale "und". Same as [`default()`](Default::default()). + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// assert_eq!(Locale::default(), Locale::UND); + /// ``` + pub const UND: Self = Self { + id: LanguageIdentifier::UND, + extensions: extensions::Extensions::new(), + }; + + /// This is a best-effort operation that performs all available levels of canonicalization. 
+ /// + /// At the moment the operation will normalize casing and the separator, but in the future + /// it may also validate and update from deprecated subtags to canonical ones. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// assert_eq!( + /// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(), + /// Ok("pl-Latn-PL-u-hc-h12") + /// ); + /// ``` + pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { + let locale = Self::try_from_bytes(input.as_ref())?; + Ok(locale.write_to_string().into_owned()) + } + + /// Compare this [`Locale`] with BCP-47 bytes. + /// + /// The return value is equivalent to what would happen if you first converted this + /// [`Locale`] to a BCP-47 string and then performed a byte comparison. + /// + /// This function is case-sensitive and results in a *total order*, so it is appropriate for + /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// use std::cmp::Ordering; + /// + /// let bcp47_strings: &[&str] = &[ + /// "pl-Latn-PL", + /// "und", + /// "und-fonipa", + /// "und-t-m0-true", + /// "und-u-ca-hebrew", + /// "und-u-ca-japanese", + /// "zh", + /// ]; + /// + /// for ab in bcp47_strings.windows(2) { + /// let a = ab[0]; + /// let b = ab[1]; + /// assert!(a.cmp(b) == Ordering::Less); + /// let a_loc = a.parse::<Locale>().unwrap(); + /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal); + /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less); + /// } + /// ``` + pub fn strict_cmp(&self, other: &[u8]) -> Ordering { + self.strict_cmp_iter(other.split(|b| *b == b'-')).end() + } + + /// Compare this [`Locale`] with an iterator of BCP-47 subtags. + /// + /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as + /// a more modular version that allows multiple subtag iterators to be chained together. 
+ /// + /// For an additional example, see [`SubtagOrderingResult`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use std::cmp::Ordering; + /// + /// let subtags: &[&[u8]] = + /// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"]; + /// + /// let loc = locale!("ca-ES-valencia-u-ca-hebrew"); + /// assert_eq!( + /// Ordering::Equal, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// + /// let loc = locale!("ca-ES-valencia"); + /// assert_eq!( + /// Ordering::Less, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// + /// let loc = locale!("ca-ES-valencia-u-nu-arab"); + /// assert_eq!( + /// Ordering::Greater, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// ``` + pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> + where + I: Iterator<Item = &'l [u8]>, + { + let r = self.for_each_subtag_str(&mut |subtag| { + if let Some(other) = subtags.next() { + match subtag.as_bytes().cmp(other) { + Ordering::Equal => Ok(()), + not_equal => Err(not_equal), + } + } else { + Err(Ordering::Greater) + } + }); + match r { + Ok(_) => SubtagOrderingResult::Subtags(subtags), + Err(o) => SubtagOrderingResult::Ordering(o), + } + } + + /// Compare this `Locale` with a potentially unnormalized BCP-47 string. + /// + /// The return value is equivalent to what would happen if you first parsed the + /// BCP-47 string to a `Locale` and then performed a structural comparison. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// use std::cmp::Ordering; + /// + /// let bcp47_strings: &[&str] = &[ + /// "pl-LaTn-pL", + /// "uNd", + /// "UND-FONIPA", + /// "UnD-t-m0-TrUe", + /// "uNd-u-CA-Japanese", + /// "ZH", + /// ]; + /// + /// for a in bcp47_strings { + /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a)); + /// } + /// ``` + pub fn normalizing_eq(&self, other: &str) -> bool { + macro_rules! 
subtag_matches { + ($T:ty, $iter:ident, $expected:expr) => { + $iter + .next() + .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) + .unwrap_or(false) + }; + } + + let mut iter = SubtagIterator::new(other.as_bytes()); + if !subtag_matches!(subtags::Language, iter, self.id.language) { + return false; + } + if let Some(ref script) = self.id.script { + if !subtag_matches!(subtags::Script, iter, *script) { + return false; + } + } + if let Some(ref region) = self.id.region { + if !subtag_matches!(subtags::Region, iter, *region) { + return false; + } + } + for variant in self.id.variants.iter() { + if !subtag_matches!(subtags::Variant, iter, *variant) { + return false; + } + } + if !self.extensions.is_empty() { + match extensions::Extensions::try_from_iter(&mut iter) { + Ok(exts) => { + if self.extensions != exts { + return false; + } + } + Err(_) => { + return false; + } + } + } + iter.next().is_none() + } + + #[doc(hidden)] + #[allow(clippy::type_complexity)] + pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension( + v: &[u8], + ) -> Result< + ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Option<subtags::Variant>, + Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>, + ), + ParserError, + > { + parse_locale_with_single_variant_single_keyword_unicode_keyword_extension( + v, + ParserMode::Locale, + ) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + self.id.for_each_subtag_str(f)?; + self.extensions.for_each_subtag_str(f)?; + Ok(()) + } +} + +impl FromStr for Locale { + type Err = ParserError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::try_from_bytes(source.as_bytes()) + } +} + +impl From<LanguageIdentifier> for Locale { + fn from(id: LanguageIdentifier) -> Self { + Self { + id, + extensions: extensions::Extensions::default(), + } + } +} + +impl From<Locale> for LanguageIdentifier { + fn from(loc: 
Locale) -> Self { + loc.id + } +} + +impl AsRef<LanguageIdentifier> for Locale { + fn as_ref(&self) -> &LanguageIdentifier { + &self.id + } +} + +impl AsMut<LanguageIdentifier> for Locale { + fn as_mut(&mut self) -> &mut LanguageIdentifier { + &mut self.id + } +} + +impl core::fmt::Debug for Locale { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + writeable::Writeable::write_to(self, f) + } +} + +impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string()); + +#[test] +fn test_writeable() { + use writeable::assert_writeable_eq; + assert_writeable_eq!(Locale::UND, "und"); + assert_writeable_eq!("und-001".parse::<Locale>().unwrap(), "und-001"); + assert_writeable_eq!("und-Mymr".parse::<Locale>().unwrap(), "und-Mymr"); + assert_writeable_eq!("my-Mymr-MM".parse::<Locale>().unwrap(), "my-Mymr-MM"); + assert_writeable_eq!( + "my-Mymr-MM-posix".parse::<Locale>().unwrap(), + "my-Mymr-MM-posix", + ); + assert_writeable_eq!( + "zh-macos-posix".parse::<Locale>().unwrap(), + "zh-macos-posix", + ); + assert_writeable_eq!( + "my-t-my-d0-zawgyi".parse::<Locale>().unwrap(), + "my-t-my-d0-zawgyi", + ); + assert_writeable_eq!( + "ar-SA-u-ca-islamic-civil".parse::<Locale>().unwrap(), + "ar-SA-u-ca-islamic-civil", + ); + assert_writeable_eq!( + "en-001-x-foo-bar".parse::<Locale>().unwrap(), + "en-001-x-foo-bar", + ); + assert_writeable_eq!("und-t-m0-true".parse::<Locale>().unwrap(), "und-t-m0-true",); +} + +/// # Examples +/// +/// ``` +/// use icu::locid::Locale; +/// use icu::locid::{locale, subtags::language}; +/// +/// assert_eq!(Locale::from(language!("en")), locale!("en")); +/// ``` +impl From<subtags::Language> for Locale { + fn from(language: subtags::Language) -> Self { + Self { + id: language.into(), + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::Locale; +/// use icu::locid::{locale, subtags::script}; +/// +/// 
assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn")); +/// ``` +impl From<Option<subtags::Script>> for Locale { + fn from(script: Option<subtags::Script>) -> Self { + Self { + id: script.into(), + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::Locale; +/// use icu::locid::{locale, subtags::region}; +/// +/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US")); +/// ``` +impl From<Option<subtags::Region>> for Locale { + fn from(region: Option<subtags::Region>) -> Self { + Self { + id: region.into(), + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::Locale; +/// use icu::locid::{ +/// locale, +/// subtags::{language, region, script}, +/// }; +/// +/// assert_eq!( +/// Locale::from(( +/// language!("en"), +/// Some(script!("Latn")), +/// Some(region!("US")) +/// )), +/// locale!("en-Latn-US") +/// ); +/// ``` +impl + From<( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + )> for Locale +{ + fn from( + lsr: ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + ), + ) -> Self { + Self { + id: lsr.into(), + ..Default::default() + } + } +} diff --git a/third_party/rust/icu_locid/src/macros.rs b/third_party/rust/icu_locid/src/macros.rs new file mode 100644 index 0000000000..4537cd4031 --- /dev/null +++ b/third_party/rust/icu_locid/src/macros.rs @@ -0,0 +1,191 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +/// A macro allowing for compile-time construction of valid [`LanguageIdentifier`]s. +/// +/// The macro will perform syntax canonicalization of the tag. 
+/// +/// # Examples +/// +/// ``` +/// use icu::locid::{langid, LanguageIdentifier}; +/// +/// const DE_AT: LanguageIdentifier = langid!("de_at"); +/// +/// let de_at: LanguageIdentifier = "de_at".parse().unwrap(); +/// +/// assert_eq!(DE_AT, de_at); +/// ``` +/// +/// *Note*: The macro cannot produce language identifiers with more than one variant due to const +/// limitations (see [`Heap Allocations in Constants`]): +/// +/// ```compile_fail,E0080 +/// icu::locid::langid!("und-variant1-variant2"); +/// ``` +/// +/// Use runtime parsing instead: +/// ``` +/// "und-variant1-variant2" +/// .parse::<icu::locid::LanguageIdentifier>() +/// .unwrap(); +/// ``` +/// +/// [`LanguageIdentifier`]: crate::LanguageIdentifier +/// [`Heap Allocations in Constants`]: https://github.com/rust-lang/const-eval/issues/20 +#[macro_export] +macro_rules! langid { + ($langid:literal) => {{ + const R: $crate::LanguageIdentifier = + match $crate::LanguageIdentifier::try_from_bytes_with_single_variant($langid.as_bytes()) { + Ok((language, script, region, variant)) => $crate::LanguageIdentifier { + language, + script, + region, + variants: match variant { + Some(v) => $crate::subtags::Variants::from_variant(v), + None => $crate::subtags::Variants::new(), + } + }, + #[allow(clippy::panic)] // const context + _ => panic!(concat!("Invalid language code: ", $langid, " . Note langid! macro can only support up to a single variant tag. Use runtime parsing instead.")), + }; + R + }}; +} + +/// A macro allowing for compile-time construction of valid [`Locale`]s. +/// +/// The macro will perform syntax canonicalization of the tag. 
+/// +/// # Examples +/// +/// ``` +/// use icu::locid::{locale, Locale}; +/// +/// const DE_AT: Locale = locale!("de_at"); +/// +/// let de_at: Locale = "de_at".parse().unwrap(); +/// +/// assert_eq!(DE_AT, de_at); +/// ``` +/// +/// *Note*: The macro cannot produce locales with more than one variant or multiple extensions +/// (only a single-keyword unicode extension is supported) due to const +/// limitations (see [`Heap Allocations in Constants`]): +/// +/// ```compile_fail,E0080 +/// icu::locid::locale!("sl-IT-rozaj-biske-1994"); +/// ``` +/// Use runtime parsing instead: +/// ``` +/// "sl-IT-rozaj-biske-1994" +/// .parse::<icu::locid::Locale>() +/// .unwrap(); +/// ``` +/// +/// Locales with multiple keys are not supported +/// ```compile_fail,E0080 +/// icu::locid::locale!("th-TH-u-ca-buddhist-nu-thai"); +/// ``` +/// Use runtime parsing instead: +/// ``` +/// "th-TH-u-ca-buddhist-nu-thai" +/// .parse::<icu::locid::Locale>() +/// .unwrap(); +/// ``` +/// +/// Locales with attributes are not supported +/// ```compile_fail,E0080 +/// icu::locid::locale!("en-US-u-foobar-ca-buddhist"); +/// ``` +/// Use runtime parsing instead: +/// ``` +/// "en-US-u-foobar-ca-buddhist" +/// .parse::<icu::locid::Locale>() +/// .unwrap(); +/// ``` +/// +/// Locales with a single key but multiple types are not supported +/// ```compile_fail,E0080 +/// icu::locid::locale!("en-US-u-ca-islamic-umalqura"); +/// ``` +/// Use runtime parsing instead: +/// ``` +/// "en-US-u-ca-islamic-umalqura" +/// .parse::<icu::locid::Locale>() +/// .unwrap(); +/// ``` +/// [`Locale`]: crate::Locale +/// [`Heap Allocations in Constants`]: https://github.com/rust-lang/const-eval/issues/20 +#[macro_export] +macro_rules! 
locale { + ($locale:literal) => {{ + const R: $crate::Locale = + match $crate::Locale::try_from_bytes_with_single_variant_single_keyword_unicode_extension( + $locale.as_bytes(), + ) { + Ok((language, script, region, variant, keyword)) => $crate::Locale { + id: $crate::LanguageIdentifier { + language, + script, + region, + variants: match variant { + Some(v) => $crate::subtags::Variants::from_variant(v), + None => $crate::subtags::Variants::new(), + }, + }, + extensions: match keyword { + Some(k) => $crate::extensions::Extensions::from_unicode( + $crate::extensions::unicode::Unicode { + keywords: $crate::extensions::unicode::Keywords::new_single( + k.0, + $crate::extensions::unicode::Value::from_tinystr(k.1), + ), + + attributes: $crate::extensions::unicode::Attributes::new(), + }, + ), + None => $crate::extensions::Extensions::new(), + }, + }, + #[allow(clippy::panic)] // const context + _ => panic!(concat!( + "Invalid language code: ", + $locale, + " . Note the locale! macro only supports up to one variant tag \ + and a single unicode keyword; other extensions are not supported. Use \ + runtime parsing instead." 
+ )), + }; + R + }}; +} + +#[cfg(test)] +mod test { + use crate::LanguageIdentifier; + use crate::Locale; + + #[test] + fn test_langid_macro_can_parse_langid_with_single_variant() { + const DE_AT_FOOBAR: LanguageIdentifier = langid!("de_at-foobar"); + let de_at_foobar: LanguageIdentifier = "de_at-foobar".parse().unwrap(); + assert_eq!(DE_AT_FOOBAR, de_at_foobar); + } + + #[test] + fn test_locale_macro_can_parse_locale_with_single_variant() { + const DE_AT_FOOBAR: Locale = locale!("de_at-foobar"); + let de_at_foobar: Locale = "de_at-foobar".parse().unwrap(); + assert_eq!(DE_AT_FOOBAR, de_at_foobar); + } + + #[test] + fn test_locale_macro_can_parse_locale_with_single_keyword_unicode_extension() { + const DE_AT_U_CA_FOOBAR: Locale = locale!("de_at-u-ca-foobar"); + let de_at_u_ca_foobar: Locale = "de_at-u-ca-foobar".parse().unwrap(); + assert_eq!(DE_AT_U_CA_FOOBAR, de_at_u_ca_foobar); + } +} diff --git a/third_party/rust/icu_locid/src/ordering.rs b/third_party/rust/icu_locid/src/ordering.rs new file mode 100644 index 0000000000..c877c60c39 --- /dev/null +++ b/third_party/rust/icu_locid/src/ordering.rs @@ -0,0 +1,62 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Utilities for performing ordering operations on locales. + +use core::cmp::Ordering; + +/// The result of a subtag iterator comparison operation. +/// +/// See [`Locale::strict_cmp_iter`]. 
+/// +/// # Examples +/// +/// Check whether a stream of subtags contains two expected locales back-to-back: +/// +/// ``` +/// use icu::locid::{locale, Locale, SubtagOrderingResult}; +/// use std::cmp::Ordering; +/// +/// let subtags = b"en-US-it-IT".split(|b| *b == b'-'); +/// let locales = [locale!("en-US"), locale!("it-IT")]; +/// let mut result = SubtagOrderingResult::Subtags(subtags); +/// for loc in locales.iter() { +/// match result { +/// SubtagOrderingResult::Subtags(it) => { +/// result = loc.strict_cmp_iter(it); +/// } +/// SubtagOrderingResult::Ordering(ord) => break, +/// } +/// } +/// +/// assert_eq!(Ordering::Equal, result.end()); +/// ``` +/// +/// [`Locale::strict_cmp_iter`]: crate::Locale::strict_cmp_iter +#[allow(clippy::exhaustive_enums)] // well-defined exhaustive enum semantics +#[derive(Debug)] +pub enum SubtagOrderingResult<I> { + /// Potentially remaining subtags after the comparison operation. + Subtags(I), + /// Resolved ordering between the locale object and the subtags. + Ordering(Ordering), +} + +impl<I> SubtagOrderingResult<I> +where + I: Iterator, +{ + /// Invoke this function if there are no remaining locale objects to chain in order to get + /// a fully resolved [`Ordering`]. + #[inline] + pub fn end(self) -> Ordering { + match self { + Self::Subtags(mut it) => match it.next() { + Some(_) => Ordering::Less, + None => Ordering::Equal, + }, + Self::Ordering(o) => o, + } + } +} diff --git a/third_party/rust/icu_locid/src/parser/errors.rs b/third_party/rust/icu_locid/src/parser/errors.rs new file mode 100644 index 0000000000..b2262460c1 --- /dev/null +++ b/third_party/rust/icu_locid/src/parser/errors.rs @@ -0,0 +1,72 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use displaydoc::Display; + +/// List of parser errors that can be generated +/// while parsing [`LanguageIdentifier`](crate::LanguageIdentifier), [`Locale`](crate::Locale), +/// [`subtags`](crate::subtags) or [`extensions`](crate::extensions). +/// +/// Re-exported as [`Error`](crate::Error). +#[derive(Display, Debug, PartialEq, Eq, Copy, Clone)] +#[non_exhaustive] +pub enum ParserError { + /// Invalid language subtag. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Language; + /// use icu::locid::ParserError; + /// + /// assert_eq!("x2".parse::<Language>(), Err(ParserError::InvalidLanguage)); + /// ``` + #[displaydoc("The given language subtag is invalid")] + InvalidLanguage, + + /// Invalid script, region or variant subtag. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Region; + /// use icu::locid::ParserError; + /// + /// assert_eq!("#@2X".parse::<Region>(), Err(ParserError::InvalidSubtag)); + /// ``` + #[displaydoc("Invalid subtag")] + InvalidSubtag, + + /// Invalid extension subtag. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::extensions::unicode::Key; + /// use icu::locid::ParserError; + /// + /// assert_eq!("#@2X".parse::<Key>(), Err(ParserError::InvalidExtension)); + /// ``` + #[displaydoc("Invalid extension")] + InvalidExtension, + + /// Duplicated extension. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// use icu::locid::ParserError; + /// + /// assert_eq!( + /// "und-u-hc-h12-u-ca-calendar".parse::<Locale>(), + /// Err(ParserError::DuplicatedExtension) + /// ); + /// ``` + #[displaydoc("Duplicated extension")] + DuplicatedExtension, +} + +#[cfg(feature = "std")] +impl std::error::Error for ParserError {} diff --git a/third_party/rust/icu_locid/src/parser/langid.rs b/third_party/rust/icu_locid/src/parser/langid.rs new file mode 100644 index 0000000000..2c6ddeb037 --- /dev/null +++ b/third_party/rust/icu_locid/src/parser/langid.rs @@ -0,0 +1,278 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +pub use super::errors::ParserError; +use crate::extensions::unicode::{Attribute, Key, Value}; +use crate::extensions::ExtensionType; +use crate::helpers::ShortSlice; +use crate::parser::SubtagIterator; +use crate::LanguageIdentifier; +use crate::{extensions, subtags}; +use tinystr::TinyAsciiStr; + +#[derive(PartialEq, Clone, Copy)] +pub enum ParserMode { + LanguageIdentifier, + Locale, + Partial, +} + +#[derive(PartialEq, Clone, Copy)] +enum ParserPosition { + Script, + Region, + Variant, +} + +pub fn parse_language_identifier_from_iter( + iter: &mut SubtagIterator, + mode: ParserMode, +) -> Result<LanguageIdentifier, ParserError> { + let mut script = None; + let mut region = None; + let mut variants = ShortSlice::new(); + + let language = if let Some(subtag) = iter.next() { + subtags::Language::try_from_bytes(subtag)? 
+ } else { + return Err(ParserError::InvalidLanguage); + }; + + let mut position = ParserPosition::Script; + + while let Some(subtag) = iter.peek() { + if mode != ParserMode::LanguageIdentifier && subtag.len() == 1 { + break; + } + + if position == ParserPosition::Script { + if let Ok(s) = subtags::Script::try_from_bytes(subtag) { + script = Some(s); + position = ParserPosition::Region; + } else if let Ok(s) = subtags::Region::try_from_bytes(subtag) { + region = Some(s); + position = ParserPosition::Variant; + } else if let Ok(v) = subtags::Variant::try_from_bytes(subtag) { + if let Err(idx) = variants.binary_search(&v) { + variants.insert(idx, v); + } + position = ParserPosition::Variant; + } else if mode == ParserMode::Partial { + break; + } else { + return Err(ParserError::InvalidSubtag); + } + } else if position == ParserPosition::Region { + if let Ok(s) = subtags::Region::try_from_bytes(subtag) { + region = Some(s); + position = ParserPosition::Variant; + } else if let Ok(v) = subtags::Variant::try_from_bytes(subtag) { + if let Err(idx) = variants.binary_search(&v) { + variants.insert(idx, v); + } + position = ParserPosition::Variant; + } else if mode == ParserMode::Partial { + break; + } else { + return Err(ParserError::InvalidSubtag); + } + } else if let Ok(v) = subtags::Variant::try_from_bytes(subtag) { + if let Err(idx) = variants.binary_search(&v) { + variants.insert(idx, v); + } else { + return Err(ParserError::InvalidSubtag); + } + } else if mode == ParserMode::Partial { + break; + } else { + return Err(ParserError::InvalidSubtag); + } + iter.next(); + } + + Ok(LanguageIdentifier { + language, + script, + region, + variants: subtags::Variants::from_short_slice_unchecked(variants), + }) +} + +pub fn parse_language_identifier( + t: &[u8], + mode: ParserMode, +) -> Result<LanguageIdentifier, ParserError> { + let mut iter = SubtagIterator::new(t); + parse_language_identifier_from_iter(&mut iter, mode) +} + +#[allow(clippy::type_complexity)] +pub const fn 
parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter( + mut iter: SubtagIterator, + mode: ParserMode, +) -> Result< + ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Option<subtags::Variant>, + Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>, + ), + ParserError, +> { + let language; + let mut script = None; + let mut region = None; + let mut variant = None; + let mut keyword = None; + + if let (i, Some((start, end))) = iter.next_manual() { + iter = i; + match subtags::Language::try_from_bytes_manual_slice(iter.slice, start, end) { + Ok(l) => language = l, + Err(e) => return Err(e), + } + } else { + return Err(ParserError::InvalidLanguage); + } + + let mut position = ParserPosition::Script; + + while let Some((start, end)) = iter.peek_manual() { + if !matches!(mode, ParserMode::LanguageIdentifier) && end - start == 1 { + break; + } + + if matches!(position, ParserPosition::Script) { + if let Ok(s) = subtags::Script::try_from_bytes_manual_slice(iter.slice, start, end) { + script = Some(s); + position = ParserPosition::Region; + } else if let Ok(r) = + subtags::Region::try_from_bytes_manual_slice(iter.slice, start, end) + { + region = Some(r); + position = ParserPosition::Variant; + } else if let Ok(v) = + subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end) + { + // We cannot handle multiple variants in a const context + debug_assert!(variant.is_none()); + variant = Some(v); + position = ParserPosition::Variant; + } else if matches!(mode, ParserMode::Partial) { + break; + } else { + return Err(ParserError::InvalidSubtag); + } + } else if matches!(position, ParserPosition::Region) { + if let Ok(s) = subtags::Region::try_from_bytes_manual_slice(iter.slice, start, end) { + region = Some(s); + position = ParserPosition::Variant; + } else if let Ok(v) = + subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end) + { + // We cannot handle multiple variants in a const context + 
debug_assert!(variant.is_none()); + variant = Some(v); + position = ParserPosition::Variant; + } else if matches!(mode, ParserMode::Partial) { + break; + } else { + return Err(ParserError::InvalidSubtag); + } + } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end) + { + debug_assert!(matches!(position, ParserPosition::Variant)); + if variant.is_some() { + // We cannot handle multiple variants in a const context + return Err(ParserError::InvalidSubtag); + } + variant = Some(v); + } else if matches!(mode, ParserMode::Partial) { + break; + } else { + return Err(ParserError::InvalidSubtag); + } + + iter = iter.next_manual().0; + } + + if matches!(mode, ParserMode::Locale) { + if let Some((start, end)) = iter.peek_manual() { + match ExtensionType::try_from_bytes_manual_slice(iter.slice, start, end) { + Ok(ExtensionType::Unicode) => { + iter = iter.next_manual().0; + if let Some((start, end)) = iter.peek_manual() { + if Attribute::try_from_bytes_manual_slice(iter.slice, start, end).is_ok() { + // We cannot handle Attributes in a const context + return Err(ParserError::InvalidSubtag); + } + } + + let mut key = None; + let mut current_type = None; + + while let Some((start, end)) = iter.peek_manual() { + let slen = end - start; + if slen == 2 { + if key.is_some() { + // We cannot handle more than one Key in a const context + return Err(ParserError::InvalidSubtag); + } + match Key::try_from_bytes_manual_slice(iter.slice, start, end) { + Ok(k) => key = Some(k), + Err(e) => return Err(e), + }; + } else if key.is_some() { + match Value::parse_subtag_from_bytes_manual_slice( + iter.slice, start, end, + ) { + Ok(Some(t)) => { + if current_type.is_some() { + // We cannot handle more than one type in a const context + return Err(ParserError::InvalidSubtag); + } + current_type = Some(t); + } + Ok(None) => {} + Err(e) => return Err(e), + } + } else { + break; + } + iter = iter.next_manual().0 + } + if let Some(k) = key { + keyword = Some((k, 
current_type)); + } + } + // We cannot handle Transform, Private, Other extensions in a const context + Ok(_) => return Err(ParserError::InvalidSubtag), + Err(e) => return Err(e), + } + } + } + + Ok((language, script, region, variant, keyword)) +} + +#[allow(clippy::type_complexity)] +pub const fn parse_language_identifier_with_single_variant( + t: &[u8], + mode: ParserMode, +) -> Result< + ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Option<subtags::Variant>, + ), + ParserError, +> { + let iter = SubtagIterator::new(t); + match parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode) { + Ok((l, s, r, v, _)) => Ok((l, s, r, v)), + Err(e) => Err(e), + } +} diff --git a/third_party/rust/icu_locid/src/parser/locale.rs b/third_party/rust/icu_locid/src/parser/locale.rs new file mode 100644 index 0000000000..175fd3a05b --- /dev/null +++ b/third_party/rust/icu_locid/src/parser/locale.rs @@ -0,0 +1,42 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use tinystr::TinyAsciiStr; + +use crate::extensions::{self, Extensions}; +use crate::parser::errors::ParserError; +use crate::parser::{parse_language_identifier_from_iter, ParserMode, SubtagIterator}; +use crate::{subtags, Locale}; + +use super::parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter; + +pub fn parse_locale(t: &[u8]) -> Result<Locale, ParserError> { + let mut iter = SubtagIterator::new(t); + + let id = parse_language_identifier_from_iter(&mut iter, ParserMode::Locale)?; + let extensions = if iter.peek().is_some() { + Extensions::try_from_iter(&mut iter)? 
+ } else { + Extensions::default() + }; + Ok(Locale { id, extensions }) +} + +#[allow(clippy::type_complexity)] +pub const fn parse_locale_with_single_variant_single_keyword_unicode_keyword_extension( + t: &[u8], + mode: ParserMode, +) -> Result< + ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Option<subtags::Variant>, + Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>, + ), + ParserError, +> { + let iter = SubtagIterator::new(t); + parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode) +} diff --git a/third_party/rust/icu_locid/src/parser/mod.rs b/third_party/rust/icu_locid/src/parser/mod.rs new file mode 100644 index 0000000000..4b02f71c9a --- /dev/null +++ b/third_party/rust/icu_locid/src/parser/mod.rs @@ -0,0 +1,231 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +pub mod errors; +mod langid; +mod locale; + +pub use errors::ParserError; +pub use langid::{ + parse_language_identifier, parse_language_identifier_from_iter, + parse_language_identifier_with_single_variant, + parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter, ParserMode, +}; + +pub use locale::{ + parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, +}; + +#[inline] +const fn is_separator(slice: &[u8], idx: usize) -> bool { + #[allow(clippy::indexing_slicing)] + let b = slice[idx]; + b == b'-' || b == b'_' +} + +const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) { + debug_assert!(idx < slice.len()); + + // This function is called only on the idx == 0 or on a separator. + let (start, mut end) = if is_separator(slice, idx) { + // If it's a separator, set the start to idx+1 and advance the idx to the next char. 
+ (idx + 1, idx + 1) + } else { + // If it's idx=0, start is 0 and end is set to 1 + debug_assert!(idx == 0); + (0, 1) + }; + + while end < slice.len() && !is_separator(slice, end) { + // Advance until we reach end of slice or a separator. + end += 1; + } + // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"` + (start, end) +} + +// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing. +// +// It is unusual because of its focus on performance and Rust's limitations for `const` +// functions. +// +// The iterator is eager; for invalid slices such as `"-"`, `"-en"`, +// `"en-"` etc. it yields empty subtags, which callers can then reject. +// +// The iterator provides methods available for static users - `next_manual` and `peek_manual`, +// as well as typical `Peekable` iterator APIs - `next` and `peek`. +// +// All methods yield an `Option` (`None` once exhausted), not a `Result`. +#[derive(Copy, Clone, Debug)] +pub struct SubtagIterator<'a> { + pub slice: &'a [u8], + done: bool, + // done + subtag is faster than Option<(usize, usize)> + // at the time of writing. 
+ subtag: (usize, usize), +} + +impl<'a> SubtagIterator<'a> { + pub const fn new(slice: &'a [u8]) -> Self { + let subtag = if slice.is_empty() || is_separator(slice, 0) { + // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"` + (0, 0) + } else { + get_current_subtag(slice, 0) + }; + Self { + slice, + done: false, + subtag, + } + } + + pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) { + if self.done { + return (self, None); + } + let result = self.subtag; + if result.1 < self.slice.len() { + self.subtag = get_current_subtag(self.slice, result.1); + } else { + self.done = true; + } + (self, Some(result)) + } + + pub const fn peek_manual(&self) -> Option<(usize, usize)> { + if self.done { + return None; + } + Some(self.subtag) + } + + pub fn peek(&self) -> Option<&'a [u8]> { + #[allow(clippy::indexing_slicing)] // peek_manual returns valid indices + self.peek_manual().map(|(s, e)| &self.slice[s..e]) + } +} + +impl<'a> Iterator for SubtagIterator<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option<Self::Item> { + let (s, res) = self.next_manual(); + *self = s; + #[allow(clippy::indexing_slicing)] // next_manual returns valid indices + res.map(|(s, e)| &self.slice[s..e]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + fn slice_to_str(input: &[u8]) -> &str { + std::str::from_utf8(input).unwrap() + } + + #[test] + fn subtag_iterator_peek_test() { + let slice = "de_at-u-ca-foobar"; + let mut si = SubtagIterator::new(slice.as_bytes()); + + assert_eq!(si.peek().map(slice_to_str), Some("de")); + assert_eq!(si.peek().map(slice_to_str), Some("de")); + assert_eq!(si.next().map(slice_to_str), Some("de")); + + assert_eq!(si.peek().map(slice_to_str), Some("at")); + assert_eq!(si.peek().map(slice_to_str), Some("at")); + assert_eq!(si.next().map(slice_to_str), Some("at")); + } + + #[test] + fn subtag_iterator_test() { + let slice = ""; + let mut si = SubtagIterator::new(slice.as_bytes()); + 
assert_eq!(si.next().map(slice_to_str), Some("")); + + let slice = "-"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + + let slice = "-en"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("en")); + assert_eq!(si.next(), None); + + let slice = "en"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]); + + let slice = "en-"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]); + + let slice = "--"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next(), None); + + let slice = "-en-"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("en")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next(), None); + + let slice = "de_at-u-ca-foobar"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!( + si.map(slice_to_str).collect::<Vec<_>>(), + vec!["de", "at", "u", "ca", "foobar",] + ); + } + + #[test] + fn get_current_subtag_test() { + let slice = "-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + + let slice = "-en"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 3)); + + let slice = "-en-"; + let current = get_current_subtag(slice.as_bytes(), 3); + assert_eq!(current, (4, 4)); + + let slice = "en-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (0, 2)); + + let current = get_current_subtag(slice.as_bytes(), 2); + assert_eq!(current, (3, 3)); + + let 
slice = "en--US"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (0, 2)); + + let current = get_current_subtag(slice.as_bytes(), 2); + assert_eq!(current, (3, 3)); + + let current = get_current_subtag(slice.as_bytes(), 3); + assert_eq!(current, (4, 6)); + + let slice = "--"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + + let current = get_current_subtag(slice.as_bytes(), 1); + assert_eq!(current, (2, 2)); + + let slice = "-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + } +} diff --git a/third_party/rust/icu_locid/src/serde.rs b/third_party/rust/icu_locid/src/serde.rs new file mode 100644 index 0000000000..3bfe303887 --- /dev/null +++ b/third_party/rust/icu_locid/src/serde.rs @@ -0,0 +1,135 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::LanguageIdentifier; +use alloc::string::ToString; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +impl Serialize for LanguageIdentifier { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for LanguageIdentifier { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: Deserializer<'de>, + { + struct LanguageIdentifierVisitor; + + impl<'de> serde::de::Visitor<'de> for LanguageIdentifierVisitor { + type Value = LanguageIdentifier; + + fn expecting(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(formatter, "a valid Unicode Language Identifier") + } + + fn visit_str<E>(self, s: &str) -> Result<Self::Value, E> + where + E: serde::de::Error, + { + s.parse::<LanguageIdentifier>() + .map_err(serde::de::Error::custom) + } + } + + deserializer.deserialize_string(LanguageIdentifierVisitor) + } +} + +#[test] +fn json() { + use crate::langid; + use crate::subtags::{Language, Region, Script}; + + assert_eq!( + serde_json::to_string(&langid!("en-US")).unwrap(), + r#""en-US""# + ); + assert_eq!( + serde_json::from_str::<LanguageIdentifier>(r#""en-US""#).unwrap(), + langid!("en-US") + ); + assert!(serde_json::from_str::<LanguageIdentifier>(r#""2Xs""#).is_err()); + + assert_eq!( + serde_json::to_string(&"fr".parse::<Language>().unwrap()).unwrap(), + r#""fr""# + ); + assert_eq!( + serde_json::from_str::<Language>(r#""fr""#).unwrap(), + "fr".parse::<Language>().unwrap() + ); + assert!(serde_json::from_str::<Language>(r#""2Xs""#).is_err()); + + assert_eq!( + serde_json::to_string(&"Latn".parse::<Script>().unwrap()).unwrap(), + r#""Latn""# + ); + assert_eq!( + serde_json::from_str::<Script>(r#""Latn""#).unwrap(), + "Latn".parse::<Script>().unwrap() + ); + assert!(serde_json::from_str::<Script>(r#""2Xs""#).is_err()); + + assert_eq!( + 
serde_json::to_string(&"US".parse::<Region>().unwrap()).unwrap(), + r#""US""# + ); + assert_eq!( + serde_json::from_str::<Region>(r#""US""#).unwrap(), + "US".parse::<Region>().unwrap() + ); + assert!(serde_json::from_str::<Region>(r#""2Xs""#).is_err()); +} + +#[test] +fn postcard() { + use crate::langid; + use crate::subtags::{Language, Region, Script}; + + assert_eq!( + postcard::to_stdvec(&langid!("en-US")).unwrap(), + &[5, b'e', b'n', b'-', b'U', b'S'] + ); + assert_eq!( + postcard::from_bytes::<LanguageIdentifier>(&[5, b'e', b'n', b'-', b'U', b'S']).unwrap(), + langid!("en-US") + ); + assert!(postcard::from_bytes::<LanguageIdentifier>(&[3, b'2', b'X', b's']).is_err()); + + assert_eq!( + postcard::to_stdvec(&"fr".parse::<Language>().unwrap()).unwrap(), + b"fr\0" + ); + assert_eq!( + postcard::from_bytes::<Language>(b"fr\0").unwrap(), + "fr".parse::<Language>().unwrap() + ); + assert!(postcard::from_bytes::<Language>(b"2Xs").is_err()); + + assert_eq!( + postcard::to_stdvec(&"Latn".parse::<Script>().unwrap()).unwrap(), + b"Latn" + ); + assert_eq!( + postcard::from_bytes::<Script>(b"Latn").unwrap(), + "Latn".parse::<Script>().unwrap() + ); + assert!(postcard::from_bytes::<Script>(b"2Xss").is_err()); + + assert_eq!( + postcard::to_stdvec(&"US".parse::<Region>().unwrap()).unwrap(), + b"US\0" + ); + assert_eq!( + postcard::from_bytes::<Region>(b"US\0").unwrap(), + "US".parse::<Region>().unwrap() + ); + assert!(postcard::from_bytes::<Region>(b"2Xs").is_err()); +} diff --git a/third_party/rust/icu_locid/src/subtags/language.rs b/third_party/rust/icu_locid/src/subtags/language.rs new file mode 100644 index 0000000000..6fd08a2d5f --- /dev/null +++ b/third_party/rust/icu_locid/src/subtags/language.rs @@ -0,0 +1,107 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +impl_tinystr_subtag!( + /// A language subtag (examples: `"en"`, `"csb"`, `"zh"`, `"und"`, etc.) + /// + /// [`Language`] represents a Unicode base language code conformant to the + /// [`unicode_language_id`] field of the Language and Locale Identifier. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Language; + /// + /// let language: Language = + /// "en".parse().expect("Failed to parse a language subtag."); + /// ``` + /// + /// If the [`Language`] has no value assigned, it serializes to a string `"und"`, which + /// can be then parsed back to an empty [`Language`] field. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Language; + /// + /// assert_eq!(Language::default().as_str(), "und"); + /// ``` + /// + /// `Notice`: ICU4X uses a narrow form of language subtag of 2-3 characters. + /// The specification allows language subtag to optionally also be 5-8 characters + /// but that form has not been used and ICU4X does not support it right now. + /// + /// [`unicode_language_id`]: https://unicode.org/reports/tr35/#unicode_language_id + Language, + subtags, + language, + subtags_language, + 2..=3, + s, + s.is_ascii_alphabetic(), + s.to_ascii_lowercase(), + s.is_ascii_alphabetic_lowercase(), + InvalidLanguage, + ["en", "foo"], + ["419", "german", "en1"], +); + +impl Language { + /// The default undefined language "und". Same as [`default()`](Default::default()). + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Language; + /// + /// assert_eq!(Language::default(), Language::UND); + /// ``` + pub const UND: Self = unsafe { Self::from_raw_unchecked(*b"und") }; + + /// Resets the [`Language`] subtag to an empty one (equal to `"und"`). 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::{language, Language}; + /// + /// let mut lang = language!("csb"); + /// + /// assert_ne!(lang, Language::UND); + /// + /// lang.clear(); + /// + /// assert_eq!(lang, Language::UND); + /// ``` + #[inline] + pub fn clear(&mut self) { + *self = Self::UND + } + + /// Tests if the [`Language`] subtag is empty (equal to `"und"`). + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Language; + /// + /// let mut lang = Language::UND; + /// + /// assert!(lang.is_empty()); + /// + /// lang.clear(); + /// + /// assert!(lang.is_empty()); + /// ``` + #[inline] + pub fn is_empty(self) -> bool { + self == Self::UND + } +} + +impl Default for Language { + fn default() -> Language { + Language::UND + } +} diff --git a/third_party/rust/icu_locid/src/subtags/mod.rs b/third_party/rust/icu_locid/src/subtags/mod.rs new file mode 100644 index 0000000000..9cc04dac8c --- /dev/null +++ b/third_party/rust/icu_locid/src/subtags/mod.rs @@ -0,0 +1,62 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Language Identifier and Locale contains a set of subtags +//! which represent different fields of the structure. +//! +//! * [`Language`] is the only mandatory field, which when empty, +//! takes the value `und`. +//! * [`Script`] is an optional field representing the written script used by the locale. +//! * [`Region`] is the region used by the locale. +//! * [`Variants`] is a list of optional [`Variant`] subtags containing information about the +//! variant adjustments used by the locale. +//! +//! Subtags can be used in isolation, and all basic operations such as parsing, syntax canonicalization +//! and serialization are supported on each individual subtag, but most commonly +//! they are used to construct a [`LanguageIdentifier`] instance. +//! 
+//! [`Variants`] is a special structure which contains a list of [`Variant`] subtags. +//! It is wrapped around to allow for sorting and deduplication of variants, which +//! is one of the required steps of language identifier and locale syntax canonicalization. +//! +//! # Examples +//! +//! ``` +//! use icu::locid::subtags::{Language, Region, Script, Variant}; +//! +//! let language: Language = +//! "en".parse().expect("Failed to parse a language subtag."); +//! let script: Script = +//! "arab".parse().expect("Failed to parse a script subtag."); +//! let region: Region = +//! "cn".parse().expect("Failed to parse a region subtag."); +//! let variant: Variant = +//! "MacOS".parse().expect("Failed to parse a variant subtag."); +//! +//! assert_eq!(language.as_str(), "en"); +//! assert_eq!(script.as_str(), "Arab"); +//! assert_eq!(region.as_str(), "CN"); +//! assert_eq!(variant.as_str(), "macos"); +//! ``` +//! +//! `Notice`: The subtags are canonicalized on parsing. That means +//! that all operations work on a canonicalized version of the subtag +//! and serialization is very cheap. +//! +//! [`LanguageIdentifier`]: super::LanguageIdentifier +mod language; +mod region; +mod script; +mod variant; +mod variants; + +#[doc(inline)] +pub use language::{language, Language}; +#[doc(inline)] +pub use region::{region, Region}; +#[doc(inline)] +pub use script::{script, Script}; +#[doc(inline)] +pub use variant::{variant, Variant}; +pub use variants::Variants; diff --git a/third_party/rust/icu_locid/src/subtags/region.rs b/third_party/rust/icu_locid/src/subtags/region.rs new file mode 100644 index 0000000000..4348f15e79 --- /dev/null +++ b/third_party/rust/icu_locid/src/subtags/region.rs @@ -0,0 +1,62 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +impl_tinystr_subtag!( + /// A region subtag (examples: `"US"`, `"CN"`, `"AR"` etc.) + /// + /// [`Region`] represents a Unicode region code conformant to the + /// [`unicode_region_id`] field of the Language and Locale Identifier. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Region; + /// + /// let region: Region = + /// "DE".parse().expect("Failed to parse a region subtag."); + /// ``` + /// + /// [`unicode_region_id`]: https://unicode.org/reports/tr35/#unicode_region_id + Region, + subtags, + region, + subtags_region, + 2..=3, + s, + if s.len() == 2 { + s.is_ascii_alphabetic() + } else { + s.is_ascii_numeric() + }, + if s.len() == 2 { + s.to_ascii_uppercase() + } else { + s + }, + if s.len() == 2 { + s.is_ascii_alphabetic_uppercase() + } else { + s.is_ascii_numeric() + }, + InvalidSubtag, + ["FR", "123"], + ["12", "FRA", "b2"], +); + +impl Region { + /// Returns true if the Region has an alphabetic code. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Region; + /// + /// let region = Region::try_from_bytes(b"us").expect("Parsing failed."); + /// + /// assert!(region.is_alphabetic()); + /// ``` + pub fn is_alphabetic(&self) -> bool { + self.0.len() == 2 + } +} diff --git a/third_party/rust/icu_locid/src/subtags/script.rs b/third_party/rust/icu_locid/src/subtags/script.rs new file mode 100644 index 0000000000..79ead0390c --- /dev/null +++ b/third_party/rust/icu_locid/src/subtags/script.rs @@ -0,0 +1,33 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +impl_tinystr_subtag!( + /// A script subtag (examples: `"Latn"`, `"Arab"`, etc.) + /// + /// [`Script`] represents a Unicode script code conformant to the + /// [`unicode_script_id`] field of the Language and Locale Identifier. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Script; + /// + /// let script: Script = + /// "Latn".parse().expect("Failed to parse a script subtag."); + /// ``` + /// + /// [`unicode_script_id`]: https://unicode.org/reports/tr35/#unicode_script_id + Script, + subtags, + script, + subtags_script, + 4..=4, + s, + s.is_ascii_alphabetic(), + s.to_ascii_titlecase(), + s.is_ascii_alphabetic_titlecase(), + InvalidSubtag, + ["Latn"], + ["Latin"], +); diff --git a/third_party/rust/icu_locid/src/subtags/variant.rs b/third_party/rust/icu_locid/src/subtags/variant.rs new file mode 100644 index 0000000000..c60b138659 --- /dev/null +++ b/third_party/rust/icu_locid/src/subtags/variant.rs @@ -0,0 +1,35 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +impl_tinystr_subtag!( + /// A variant subtag (examples: `"macos"`, `"posix"`, `"1996"` etc.) + /// + /// [`Variant`] represents a Unicode language variant code conformant to the + /// [`unicode_variant_id`] field of the Language and Locale Identifier. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Variant; + /// + /// let variant: Variant = + /// "macos".parse().expect("Failed to parse a variant subtag."); + /// ``` + /// + /// [`unicode_variant_id`]: https://unicode.org/reports/tr35/#unicode_variant_id + Variant, + subtags, + variant, + subtags_variant, + 4..=8, + s, + s.is_ascii_alphanumeric() && (s.len() != 4 || s.all_bytes()[0].is_ascii_digit()), + s.to_ascii_lowercase(), + s.is_ascii_lowercase() + && s.is_ascii_alphanumeric() + && (s.len() != 4 || s.all_bytes()[0].is_ascii_digit()), + InvalidSubtag, + ["posix", "1996"], + ["yes"], +); diff --git a/third_party/rust/icu_locid/src/subtags/variants.rs b/third_party/rust/icu_locid/src/subtags/variants.rs new file mode 100644 index 0000000000..ba5ff1bc1a --- /dev/null +++ b/third_party/rust/icu_locid/src/subtags/variants.rs @@ -0,0 +1,128 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use super::Variant; +use crate::helpers::ShortSlice; + +use alloc::vec::Vec; +use core::ops::Deref; + +/// A list of variants (examples: `["macos", "posix"]`, etc.) +/// +/// [`Variants`] stores a list of [`Variant`] subtags in a canonical form +/// by sorting and deduplicating them. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::subtags::{variant, Variants}; +/// +/// let mut v = vec![variant!("posix"), variant!("macos")]; +/// v.sort(); +/// v.dedup(); +/// +/// let variants: Variants = Variants::from_vec_unchecked(v); +/// assert_eq!(variants.to_string(), "macos-posix"); +/// ``` +#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)] +pub struct Variants(ShortSlice<Variant>); + +impl Variants { + /// Returns a new empty list of variants. Same as [`default()`](Default::default()), but is `const`. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::Variants; + /// + /// assert_eq!(Variants::new(), Variants::default()); + /// ``` + #[inline] + pub const fn new() -> Self { + Self(ShortSlice::new()) + } + + /// Creates a new [`Variants`] set from a single [`Variant`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::{variant, Variants}; + /// + /// let variants = Variants::from_variant(variant!("posix")); + /// ``` + #[inline] + pub const fn from_variant(variant: Variant) -> Self { + Self(ShortSlice::new_single(variant)) + } + + /// Creates a new [`Variants`] set from a [`Vec`]. + /// The caller is expected to provide sorted and deduplicated vector as + /// an input. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::{variant, Variants}; + /// + /// let mut v = vec![variant!("posix"), variant!("macos")]; + /// v.sort(); + /// v.dedup(); + /// + /// let variants = Variants::from_vec_unchecked(v); + /// ``` + /// + /// Notice: For performance- and memory-constrained environments, it is recommended + /// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort) + /// and [`dedup`](Vec::dedup()). + pub fn from_vec_unchecked(input: Vec<Variant>) -> Self { + Self(input.into()) + } + + pub(crate) fn from_short_slice_unchecked(input: ShortSlice<Variant>) -> Self { + Self(input) + } + + /// Empties the [`Variants`] list. + /// + /// Returns the old list. 
+ /// + /// # Examples + /// + /// ``` + /// use icu::locid::subtags::{variant, Variants}; + /// + /// let mut v = vec![variant!("posix"), variant!("macos")]; + /// v.sort(); + /// v.dedup(); + /// + /// let mut variants: Variants = Variants::from_vec_unchecked(v); + /// + /// assert_eq!(variants.to_string(), "macos-posix"); + /// + /// variants.clear(); + /// + /// assert_eq!(variants, Variants::default()); + /// ``` + pub fn clear(&mut self) -> Self { + core::mem::take(self) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + self.deref().iter().map(|t| t.as_str()).try_for_each(f) + } +} + +impl_writeable_for_subtag_list!(Variants, "macos", "posix"); + +impl Deref for Variants { + type Target = [Variant]; + + fn deref(&self) -> &[Variant] { + self.0.deref() + } +} diff --git a/third_party/rust/icu_locid/src/zerovec.rs b/third_party/rust/icu_locid/src/zerovec.rs new file mode 100644 index 0000000000..ba6a3e85d6 --- /dev/null +++ b/third_party/rust/icu_locid/src/zerovec.rs @@ -0,0 +1,132 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Documentation on zero-copy deserialization of locale types. +//! +//! [`Locale`] and [`LanguageIdentifier`] are highly structured types that cannot be directly +//! stored in a zero-copy data structure, such as those provided by the [`zerovec`] crate. +//! This page explains how to indirectly store these types in a [`zerovec`]. +//! +//! There are two main use cases, which have different solutions: +//! +//! 1. **Lookup:** You need to locate a locale in a zero-copy vector, such as when querying a map. +//! 2. **Obtain:** You have a locale stored in a zero-copy vector, and you need to obtain a proper +//! [`Locale`] or [`LanguageIdentifier`] for use elsewhere in your program. +//! +//! 
# Lookup +//! +//! To perform lookup, store the stringified locale in a canonical BCP-47 form as a byte array, +//! and then use [`Locale::strict_cmp()`] to perform an efficient, zero-allocation lookup. +//! +//! To produce more human-readable serialized output, you can use [`UnvalidatedStr`]. +//! +//! ``` +//! use icu_locid::Locale; +//! use zerovec::ule::UnvalidatedStr; +//! use zerovec::ZeroMap; +//! +//! // ZeroMap from locales to integers +//! let data: &[(&UnvalidatedStr, u32)] = &[ +//! ("de-DE-u-hc-h12".into(), 5), +//! ("en-US-u-ca-buddhist".into(), 10), +//! ("my-MM".into(), 15), +//! ("sr-Cyrl-ME".into(), 20), +//! ("zh-TW".into(), 25), +//! ]; +//! let zm: ZeroMap<UnvalidatedStr, u32> = data.iter().copied().collect(); +//! +//! // Get the value associated with a locale +//! let loc: Locale = "en-US-u-ca-buddhist".parse().unwrap(); +//! let value = zm.get_copied_by(|uvstr| loc.strict_cmp(uvstr).reverse()); +//! assert_eq!(value, Some(10)); +//! ``` +//! +//! # Obtain +//! +//! Obtaining a [`Locale`] or [`LanguageIdentifier`] is not generally a zero-copy operation, since +//! both of these types may require memory allocation. If possible, architect your code such that +//! you do not need to obtain a structured type. +//! +//! If you need the structured type, such as if you need to manipulate it in some way, there are two +//! options: storing subtags, and storing a string for parsing. +//! +//! ## Storing Subtags +//! +//! If the data being stored only contains a limited number of subtags, you can store them as a +//! tuple, and then construct the [`LanguageIdentifier`] externally. +//! +//! ``` +//! use icu_locid::subtags::{Language, Region, Script}; +//! use icu_locid::LanguageIdentifier; +//! use icu_locid::{ +//! langid, +//! subtags::{language, region, script}, +//! }; +//! use zerovec::ZeroMap; +//! +//! // ZeroMap from integer to LSR (language-script-region) +//! let zm: ZeroMap<u32, (Language, Option<Script>, Option<Region>)> = [ +//! 
(5, (language!("de"), None, Some(region!("DE")))), +//! (10, (language!("en"), None, Some(region!("US")))), +//! (15, (language!("my"), None, Some(region!("MM")))), +//! ( +//! 20, +//! (language!("sr"), Some(script!("Cyrl")), Some(region!("ME"))), +//! ), +//! (25, (language!("zh"), None, Some(region!("TW")))), +//! ] +//! .into_iter() +//! .collect(); +//! +//! // Construct a LanguageIdentifier from a tuple entry +//! let lid: LanguageIdentifier = +//! zm.get_copied(&25).expect("element is present").into(); +//! +//! assert_eq!(lid, langid!("zh-TW")); +//! ``` +//! +//! ## Storing Strings +//! +//! If it is necessary to store and obtain an arbitrary locale, it is currently recommended to +//! store a BCP-47 string and parse it when needed. +//! +//! Since the string is stored in an unparsed state, it is not safe to `unwrap` the result from +//! `Locale::try_from_bytes()`. See [icu4x#831](https://github.com/unicode-org/icu4x/issues/831) +//! for a discussion on potential data models that could ensure that the locale is valid during +//! deserialization. +//! +//! As above, to produce more human-readable serialized output, you can use [`UnvalidatedStr`]. +//! +//! ``` +//! use icu_locid::langid; +//! use icu_locid::Locale; +//! use zerovec::ule::UnvalidatedStr; +//! use zerovec::ZeroMap; +//! +//! // ZeroMap from integer to locale string +//! let data: &[(u32, &UnvalidatedStr)] = &[ +//! (5, "de-DE-u-hc-h12".into()), +//! (10, "en-US-u-ca-buddhist".into()), +//! (15, "my-MM".into()), +//! (20, "sr-Cyrl-ME".into()), +//! (25, "zh-TW".into()), +//! (30, "INVALID".into()), +//! ]; +//! let zm: ZeroMap<u32, UnvalidatedStr> = data.iter().copied().collect(); +//! +//! // Construct a Locale by parsing the string. +//! let value = zm.get(&25).expect("element is present"); +//! let loc = Locale::try_from_bytes(value); +//! assert_eq!(loc, Ok(langid!("zh-TW").into())); +//! +//! // Invalid entries are fallible +//! 
let err_value = zm.get(&30).expect("element is present"); +//! let err_loc = Locale::try_from_bytes(err_value); +//! assert!(matches!(err_loc, Err(_))); +//! ``` +//! +//! [`Locale`]: crate::Locale +//! [`Locale::strict_cmp()`]: crate::Locale::strict_cmp() +//! [`LanguageIdentifier`]: crate::LanguageIdentifier +//! [`UnvalidatedStr`]: zerovec::ule::UnvalidatedStr diff --git a/third_party/rust/icu_locid/tests/fixtures/canonicalize.json b/third_party/rust/icu_locid/tests/fixtures/canonicalize.json new file mode 100644 index 0000000000..79a5057146 --- /dev/null +++ b/third_party/rust/icu_locid/tests/fixtures/canonicalize.json @@ -0,0 +1,68 @@ +[ + { + "input": "Pl", + "output": "pl" + }, + { + "input": "eN-uS", + "output": "en-US" + }, + { + "input": "ZH_hans_hK", + "output": "zh-Hans-HK" + }, + { + "input": "en-scouse-fonipa", + "output": "en-fonipa-scouse" + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-t-es-AR-x-foo" + }, + "output": { + "type": "Locale", + "identifier": "en-US-t-es-ar-x-foo" + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-t-en-Latn-CA-emodeng" + }, + "output": { + "type": "Locale", + "identifier": "en-t-en-latn-ca-emodeng" + } + }, + { + "input": { + "type": "Locale", + "identifier": "EN-US-T-ES-AR-X-FOO" + }, + "output": { + "type": "Locale", + "identifier": "en-US-t-es-ar-x-foo" + } + }, + { + "input": { + "type": "Locale", + "identifier": "EN-T-EN-LATN-CA-EMODENG" + }, + "output": { + "type": "Locale", + "identifier": "en-t-en-latn-ca-emodeng" + } + }, + { + "input": { + "type": "Locale", + "identifier": "UND-CYRL-T-ES-LATN-M0-UNGEGN" + }, + "output": { + "type": "Locale", + "identifier": "und-Cyrl-t-es-latn-m0-ungegn" + } + } +] diff --git a/third_party/rust/icu_locid/tests/fixtures/invalid-extensions.json b/third_party/rust/icu_locid/tests/fixtures/invalid-extensions.json new file mode 100644 index 0000000000..3aff2636b2 --- /dev/null +++ b/third_party/rust/icu_locid/tests/fixtures/invalid-extensions.json @@ 
-0,0 +1,152 @@ +[ + { + "input": { + "type": "Locale", + "identifier": "pl-US-x-waytoolongkey" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "pl-US-x-@A_3" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "pl-US-t-h0" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "pl-US-t-h0-x-foo" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "pl-US-t-h0" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-t-m0" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-t-m0-n0-mixed" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "da-u" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "da-u--" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "da-u-t-latn" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "cmn-hans-cn-u-u" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "de-u-ca-" + }, + "output": { + "error": "InvalidExtension", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "de-u-ca-gregory-" + }, + "output": { + "error": "InvalidExtension", + "text": 
"Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "de-u-ca-gregory-u-hc-hc24" + }, + "output": { + "error": "DuplicatedExtension", + "text": "Duplicated extension" + } + }, + { + "input": { + "type": "Locale", + "identifier": "de-l-foo-l-bar" + }, + "output": { + "error": "DuplicatedExtension", + "text": "Duplicated extension" + } + } +] diff --git a/third_party/rust/icu_locid/tests/fixtures/invalid.json b/third_party/rust/icu_locid/tests/fixtures/invalid.json new file mode 100644 index 0000000000..c22459e65d --- /dev/null +++ b/third_party/rust/icu_locid/tests/fixtures/invalid.json @@ -0,0 +1,134 @@ +[ + { + "input": "-", + "output": { + "error": "InvalidLanguage", + "text": "The given language subtag is invalid" + } + }, + { + "input": "--", + "output": { + "error": "InvalidLanguage", + "text": "The given subtag is invalid" + } + }, + { + "input": "en-", + "output": { + "error": "InvalidSubtag", + "text": "The given subtag is invalid" + } + }, + { + "input": "-en", + "output": { + "error": "InvalidLanguage", + "text": "The given subtag is invalid" + } + }, + { + "input": "en-us-", + "output": { + "error": "InvalidSubtag", + "text": "The given subtag is invalid" + } + }, + { + "input": "en--US", + "output": { + "error": "InvalidSubtag", + "text": "The given subtag is invalid" + } + }, + { + "input": "-e-", + "output": { + "error": "InvalidLanguage", + "text": "The given subtag is invalid" + } + }, + { + "input": "a1a", + "output": { + "error": "InvalidLanguage", + "text": "The given language subtag is invalid" + } + }, + { + "input": "Arab-US", + "output": { + "error": "InvalidLanguage", + "text": "The given language subtag is invalid" + } + }, + { + "input": "", + "output": { + "error": "InvalidLanguage", + "text": "The given language subtag is invalid" + } + }, + { + "input": "pl-DSDAFAFDF", + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": "pl-Latn-$1231", + "output": { + "error": 
"InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": "pl-Latn-US-$1231", + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": "pl-Latn-12", + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": "pl-Latn-a12", + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": "pl-Latn-US-3_dd", + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "pl-Latn-US-variant-h0-hybrid" + }, + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-variant-emodeng-emodeng" + }, + "output": { + "error": "InvalidSubtag", + "text": "Invalid subtag" + } + } +] diff --git a/third_party/rust/icu_locid/tests/fixtures/langid.json b/third_party/rust/icu_locid/tests/fixtures/langid.json new file mode 100644 index 0000000000..31740d99aa --- /dev/null +++ b/third_party/rust/icu_locid/tests/fixtures/langid.json @@ -0,0 +1,167 @@ +[ + { + "input": "en", + "output": { + "type": "LanguageIdentifier", + "language": "en" + } + }, + { + "input": "lij", + "output": { + "type": "LanguageIdentifier", + "language": "lij" + } + }, + { + "input": "en-Latn", + "output": { + "type": "LanguageIdentifier", + "language": "en", + "script": "Latn" + } + }, + { + "input": "lij-Arab", + "output": { + "type": "LanguageIdentifier", + "language": "lij", + "script": "Arab" + } + }, + { + "input": "en-Latn-US", + "output": { + "type": "LanguageIdentifier", + "language": "en", + "script": "Latn", + "region": "US" + } + }, + { + "input": "lij-Arab-FA", + "output": { + "type": "LanguageIdentifier", + "language": "lij", + "script": "Arab", + "region": "FA" + } + }, + { + "input": "en-Latn-US-windows", + "output": { + "type": "LanguageIdentifier", + "language": "en", + "script": "Latn", + "region": "US", + "variants": ["windows"] + } + }, + { + 
"input": "lij-Arab-FA-linux", + "output": { + "type": "LanguageIdentifier", + "language": "lij", + "script": "Arab", + "region": "FA", + "variants": ["linux"] + } + }, + { + "input": "lij-Arab-FA-linux-nedis", + "output": { + "type": "LanguageIdentifier", + "language": "lij", + "script": "Arab", + "region": "FA", + "variants": ["linux", "nedis"] + } + }, + { + "input": "EN-latn-us", + "output": { + "type": "LanguageIdentifier", + "language": "en", + "script": "Latn", + "region": "US" + } + }, + { + "input": "sl-nedis", + "output": { + "type": "LanguageIdentifier", + "language": "sl", + "variants": ["nedis"] + } + }, + { + "input": "de-CH-1996", + "output": { + "type": "LanguageIdentifier", + "language": "de", + "region": "CH", + "variants": ["1996"] + } + }, + { + "input": "sr-Latn", + "output": { + "type": "LanguageIdentifier", + "language": "sr", + "script": "Latn" + } + }, + { + "input": "es-419", + "output": { + "type": "LanguageIdentifier", + "language": "es", + "region": "419" + } + }, + { + "input": "und-Latn-US", + "output": { + "type": "LanguageIdentifier", + "script": "Latn", + "region": "US" + } + }, + { + "input": "und", + "output": { + "type": "LanguageIdentifier" + } + }, + { + "input": "und-Latn", + "output": { + "type": "LanguageIdentifier", + "script": "Latn" + } + }, + { + "input": "pl-macos-Windows-nedis-aRabic", + "output": { + "type": "LanguageIdentifier", + "language": "pl", + "variants": ["arabic", "macos", "nedis", "windows"] + } + }, + { + "input": "und-Latn-macos", + "output": { + "type": "LanguageIdentifier", + "script": "Latn", + "variants": ["macos"] + } + }, + { + "input": "und-Latn-312", + "output": { + "type": "LanguageIdentifier", + "script": "Latn", + "region": "312" + } + } +] diff --git a/third_party/rust/icu_locid/tests/fixtures/locale.json b/third_party/rust/icu_locid/tests/fixtures/locale.json new file mode 100644 index 0000000000..93679a0667 --- /dev/null +++ b/third_party/rust/icu_locid/tests/fixtures/locale.json @@ -0,0 
+1,298 @@ +[ + { + "input": { + "type": "Locale", + "identifier": "en-u-hc-h12" + }, + "output": { + "type": "Locale", + "language": "en", + "extensions": { + "unicode": { + "keywords": { + "hc": "h12" + } + } + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-u-hc-h23" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "unicode": { + "keywords": { + "hc": "h23" + } + } + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-u-foo" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "unicode": { + "attributes": [ + "foo" + ] + } + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-u-hc-h23-ca-islamic-civil-ss-true" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "unicode": { + "keywords": { + "hc": "h23", + "ca": "islamic-civil", + "ss": "true" + } + } + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-t-pl-latn-de" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "transform": { + "tlang": "pl-Latn-DE" + } + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-x-private-foobar" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "private": ["private", "foobar"] + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-t-h0-hybrid-k0-platform-s0-true" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "transform": { + "tfields": { + "h0": "hybrid", + "k0": "platform", + "s0": "true" + } + } + } + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-US-t-es-ar-x-foo" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "transform": { + "tlang": "es-AR" + }, + "private": ["foo"] + } + } + }, + { + "input": { + "type": "Locale", 
+ "identifier": "en-US-u-ca-buddhist-hc-h12-t-es-ar-h0-hybrid-x-private-foobar" + }, + "output": { + "type": "Locale", + "language": "en", + "region": "US", + "extensions": { + "unicode": { + "keywords": { + "ca": "buddhist", + "hc": "h12" + } + }, + "transform": { + "tlang": "es-AR", + "tfields": { + "h0": "hybrid" + } + }, + "private": ["private", "foobar"] + } + } + }, + { + "input": { + "type": "Locale", + "language": "es", + "region": "MX", + "extensions": { + "unicode": { + "keywords": { + "ca": "islamic", + "co": "search", + "nu": "roman" + } + } + } + }, + "output": { + "type": "Locale", + "identifier": "es-MX-u-ca-islamic-co-search-nu-roman" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-u-kn" + }, + "output": { + "type": "Locale", + "identifier": "und-u-kn" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-u-kn-ca-calendar" + }, + "output": { + "type": "Locale", + "identifier": "und-u-ca-calendar-kn" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-u-kn-nu-arab" + }, + "output": { + "type": "Locale", + "identifier": "und-u-kn-nu-arab" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-t-m0-true" + }, + "output": { + "type": "Locale", + "identifier": "und-t-m0-true" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-t-m0-true-n0-mixed" + }, + "output": { + "type": "Locale", + "identifier": "und-t-m0-true-n0-mixed" + } + }, + { + "input": { + "type": "Locale", + "identifier": "und-t-m0-true-c0-mixed" + }, + "output": { + "type": "Locale", + "identifier": "und-t-c0-mixed-m0-true" + } + }, + { + "input": { + "type": "Locale", + "identifier": "da-u-ca-gregory-ca-buddhist" + }, + "output": { + "type": "Locale", + "identifier": "da-u-ca-gregory" + } + }, + { + "input": { + "type": "Locale", + "identifier": "pt-u-attr2-attr1-ca-gregory" + }, + "output": { + "type": "Locale", + "identifier": "pt-u-attr1-attr2-ca-gregory" + } + }, + { + "input": { + "type": "Locale", + 
"identifier": "pt-u-attr1-attr2-attr1-ca-gregory" + }, + "output": { + "type": "Locale", + "identifier": "pt-u-attr1-attr2-ca-gregory" + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-a-not-assigned" + }, + "output": { + "type": "Locale", + "identifier": "en-a-not-assigned" + } + }, + { + "input": { + "type": "Locale", + "identifier": "en-w-bar-u-foo-a-bar-x-u-foo" + }, + "output": { + "type": "Locale", + "identifier": "en-a-bar-u-foo-w-bar-x-u-foo" + } + } +] diff --git a/third_party/rust/icu_locid/tests/fixtures/mod.rs b/third_party/rust/icu_locid/tests/fixtures/mod.rs new file mode 100644 index 0000000000..f00fd6c3b9 --- /dev/null +++ b/third_party/rust/icu_locid/tests/fixtures/mod.rs @@ -0,0 +1,261 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use std::collections::HashMap; +use std::convert::{TryFrom, TryInto}; + +use icu_locid::extensions::private; +use icu_locid::extensions::transform; +use icu_locid::extensions::unicode; +use icu_locid::extensions::Extensions; +use icu_locid::{subtags, LanguageIdentifier, Locale, ParserError}; +use serde::Deserialize; + +#[derive(Debug, Deserialize, Clone)] +pub struct LocaleIdentifier { + #[serde(rename = "type")] + pub field_type: String, + pub identifier: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct LocaleExtensionUnicode { + #[serde(default)] + keywords: HashMap<String, Option<String>>, + #[serde(default)] + attributes: Vec<String>, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct LocaleExtensionTransform { + tlang: Option<String>, + #[serde(default)] + tfields: HashMap<String, Option<String>>, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct LocaleExtensions { + unicode: Option<LocaleExtensionUnicode>, + transform: Option<LocaleExtensionTransform>, + #[serde(default)] + private: Vec<String>, + _other: 
Option<String>, +} + +impl TryFrom<LocaleExtensions> for Extensions { + type Error = ParserError; + + fn try_from(input: LocaleExtensions) -> Result<Self, Self::Error> { + let mut ext = Extensions::default(); + if let Some(unicode) = input.unicode { + ext.unicode.keywords = unicode + .keywords + .iter() + .map(|(k, v)| { + ( + unicode::Key::try_from_bytes(k.as_bytes()).expect("Parsing key failed."), + v.as_ref().map_or( + unicode::Value::try_from_bytes(b"").expect("Failed to parse Value"), + |v| { + unicode::Value::try_from_bytes(v.as_bytes()) + .expect("Parsing type failed.") + }, + ), + ) + }) + .collect(); + let v: Vec<unicode::Attribute> = unicode + .attributes + .iter() + .map(|v| { + unicode::Attribute::try_from_bytes(v.as_bytes()) + .expect("Parsing attribute failed.") + }) + .collect(); + ext.unicode.attributes = unicode::Attributes::from_vec_unchecked(v); + } + if let Some(transform) = input.transform { + ext.transform.fields = transform + .tfields + .iter() + .map(|(k, v)| { + ( + transform::Key::try_from_bytes(k.as_bytes()).expect("Parsing key failed."), + v.as_ref() + .map(|v| { + transform::Value::try_from_bytes(v.as_bytes()) + .expect("Parsing value failed.") + }) + .expect("Value cannot be empty."), + ) + }) + .collect(); + + if let Some(tlang) = transform.tlang { + ext.transform.lang = Some(tlang.parse().expect("Failed to parse tlang.")); + } + } + let v: Vec<private::Subtag> = input + .private + .iter() + .map(|v| private::Subtag::try_from_bytes(v.as_bytes()).expect("Failed to add field.")) + .collect(); + ext.private = private::Private::from_vec_unchecked(v); + Ok(ext) + } +} + +#[derive(Debug, Deserialize, Clone)] +pub struct LocaleSubtags { + #[serde(rename = "type")] + pub field_type: String, + pub language: Option<String>, + pub script: Option<String>, + pub region: Option<String>, + #[serde(default)] + pub variants: Vec<String>, + pub extensions: Option<LocaleExtensions>, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct LocaleError { + 
pub error: String, + pub text: String, +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(untagged)] +#[allow(clippy::large_enum_variant)] // test code +pub enum LocaleInfo { + String(String), + Error(LocaleError), + Identifier(LocaleIdentifier), + Object(LocaleSubtags), +} + +impl TryFrom<LocaleInfo> for LanguageIdentifier { + type Error = ParserError; + + fn try_from(input: LocaleInfo) -> Result<Self, Self::Error> { + match input { + LocaleInfo::String(s) => s.parse(), + LocaleInfo::Error(e) => Err(e.into()), + LocaleInfo::Identifier(ident) => ident.try_into(), + LocaleInfo::Object(o) => o.try_into(), + } + } +} + +impl TryFrom<LocaleInfo> for Locale { + type Error = ParserError; + + fn try_from(input: LocaleInfo) -> Result<Self, Self::Error> { + match input { + LocaleInfo::String(s) => s.parse(), + LocaleInfo::Error(e) => Err(e.into()), + LocaleInfo::Identifier(ident) => ident.try_into(), + LocaleInfo::Object(o) => o.try_into(), + } + } +} + +impl TryFrom<LocaleIdentifier> for LanguageIdentifier { + type Error = ParserError; + + fn try_from(input: LocaleIdentifier) -> Result<Self, Self::Error> { + LanguageIdentifier::try_from_locale_bytes(input.identifier.as_bytes()) + } +} + +impl TryFrom<LocaleIdentifier> for Locale { + type Error = ParserError; + + fn try_from(input: LocaleIdentifier) -> Result<Self, Self::Error> { + Locale::try_from_bytes(input.identifier.as_bytes()) + } +} + +impl TryFrom<LocaleSubtags> for LanguageIdentifier { + type Error = ParserError; + + fn try_from(subtags: LocaleSubtags) -> Result<Self, Self::Error> { + let language = if let Some(lang) = subtags.language { + lang.parse().expect("Failed to parse language subtag") + } else { + subtags::Language::default() + }; + let script = subtags + .script + .map(|s| s.parse().expect("Failed to parse script subtag.")); + let region = subtags + .region + .map(|s| s.parse().expect("Failed to parse region subtag.")); + let variants = subtags + .variants + .iter() + .map(|v| v.parse().expect("Failed to 
parse variant subtag.")) + .collect::<Vec<_>>(); + Ok(LanguageIdentifier { + language, + script, + region, + variants: subtags::Variants::from_vec_unchecked(variants), + }) + } +} + +impl TryFrom<LocaleSubtags> for Locale { + type Error = ParserError; + + fn try_from(subtags: LocaleSubtags) -> Result<Self, Self::Error> { + let language = if let Some(lang) = subtags.language { + lang.parse().expect("Failed to parse language subtag") + } else { + subtags::Language::default() + }; + let script = subtags + .script + .map(|s| s.parse().expect("Failed to parse script subtag.")); + let region = subtags + .region + .map(|s| s.parse().expect("Failed to parse region subtag.")); + let variants = subtags + .variants + .iter() + .map(|v| v.parse().expect("Failed to parse variant subtag.")) + .collect::<Vec<_>>(); + let extensions = if let Some(e) = subtags.extensions { + e.try_into().expect("Failed to parse extensions.") + } else { + Extensions::default() + }; + Ok(Locale { + id: LanguageIdentifier { + language, + script, + region, + variants: subtags::Variants::from_vec_unchecked(variants), + }, + extensions, + }) + } +} + +impl From<LocaleError> for ParserError { + fn from(e: LocaleError) -> Self { + match e.error.as_str() { + "InvalidLanguage" => ParserError::InvalidLanguage, + "InvalidSubtag" => ParserError::InvalidSubtag, + "InvalidExtension" => ParserError::InvalidExtension, + "DuplicatedExtension" => ParserError::DuplicatedExtension, + _ => unreachable!("Unknown error name"), + } + } +} + +#[derive(Debug, Deserialize)] +pub struct LocaleTest { + pub input: LocaleInfo, + pub output: LocaleInfo, +} diff --git a/third_party/rust/icu_locid/tests/helpers/mod.rs b/third_party/rust/icu_locid/tests/helpers/mod.rs new file mode 100644 index 0000000000..d250c510c5 --- /dev/null +++ b/third_party/rust/icu_locid/tests/helpers/mod.rs @@ -0,0 +1,15 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use std::fs::File; +use std::io::{BufReader, Error}; + +pub fn read_fixture<T>(path: &str) -> Result<T, Error> +where + T: serde::de::DeserializeOwned, +{ + let file = File::open(path)?; + let reader = BufReader::new(file); + Ok(serde_json::from_reader(reader)?) +} diff --git a/third_party/rust/icu_locid/tests/langid.rs b/third_party/rust/icu_locid/tests/langid.rs new file mode 100644 index 0000000000..ee7bb9817e --- /dev/null +++ b/third_party/rust/icu_locid/tests/langid.rs @@ -0,0 +1,158 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod fixtures; +mod helpers; + +use std::convert::TryInto; +use writeable::*; + +use icu_locid::{subtags, LanguageIdentifier, ParserError}; + +type Result = std::result::Result<LanguageIdentifier, ParserError>; + +fn test_langid_fixtures(tests: Vec<fixtures::LocaleTest>) { + for test in tests { + match test.output { + fixtures::LocaleInfo::String(s) => { + if let fixtures::LocaleInfo::Object(ref o) = &test.input { + if o.field_type == "Locale" { + continue; + } + } + let input: LanguageIdentifier = test.input.try_into().expect("Parsing failed."); + assert_writeable_eq!(input, s); + } + fixtures::LocaleInfo::Error(err) => { + let err: ParserError = err.into(); + let input: Result = test.input.try_into(); + assert_eq!(input, Err(err)); + } + fixtures::LocaleInfo::Identifier(ident) => { + let input: LanguageIdentifier = test.input.try_into().expect("Parsing failed."); + let output: LanguageIdentifier = ident.try_into().expect("Parsing failed."); + assert_eq!(input, output); + } + fixtures::LocaleInfo::Object(o) => { + let input: LanguageIdentifier = test.input.try_into().expect("Parsing failed."); + let 
output: LanguageIdentifier = o.try_into().expect("Parsing failed."); + assert_eq!(input, output); + } + } + } +} + +#[test] +fn test_langid_parsing() { + let path = "./tests/fixtures/langid.json"; + let data = helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_langid_invalid() { + let path = "./tests/fixtures/invalid.json"; + let data = helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_langid_canonicalize() { + let path = "./tests/fixtures/canonicalize.json"; + let data = helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_langid_from_locale() { + let path = "./tests/fixtures/locale.json"; + let data = helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_langid_subtag_language() { + let mut lang: subtags::Language = "en".parse().expect("Failed to parse a language."); + assert_eq!(lang.as_str(), "en"); + + lang.clear(); + assert_eq!(lang, subtags::Language::UND); + assert!(lang.is_empty()); + + assert_writeable_eq!(lang, "und"); +} + +#[test] +fn test_langid_subtag_region() { + let region: subtags::Region = "en".parse().expect("Failed to parse a region."); + assert_eq!(region.as_str(), "EN"); + assert_writeable_eq!(region, "EN"); +} + +#[test] +fn test_langid_subtag_script() { + let script: subtags::Script = "Latn".parse().expect("Failed to parse a script."); + assert_eq!(script.as_str(), "Latn"); + assert_writeable_eq!(script, "Latn"); +} + +#[test] +fn test_langid_subtag_variant() { + let variant: subtags::Variant = "macos".parse().expect("Failed to parse a variant."); + assert_eq!(variant.as_str(), "macos"); + assert_writeable_eq!(variant, "macos"); +} + +#[test] +fn test_langid_subtag_variants() { + let variant: subtags::Variant = "macos".parse().expect("Failed to parse a variant."); + let mut 
variants = subtags::Variants::from_vec_unchecked(vec![variant]); + assert_eq!(variants.get(0), Some(&variant)); + variants.clear(); + assert_eq!(variants.len(), 0); +} + +#[test] +fn test_langid_normalizing_eq_str() { + let path = "./tests/fixtures/langid.json"; + let tests: Vec<fixtures::LocaleTest> = + helpers::read_fixture(path).expect("Failed to read a fixture"); + for test in tests { + let parsed: LanguageIdentifier = test.input.try_into().expect("Parsing failed."); + assert!(parsed.normalizing_eq(&parsed.write_to_string())); + } + + // Check that trailing characters are not ignored + let lang: LanguageIdentifier = "en".parse().expect("Parsing failed."); + assert!(!lang.normalizing_eq("en-US")); +} + +#[test] +fn test_langid_strict_cmp() { + let path = "./tests/fixtures/langid.json"; + let tests: Vec<fixtures::LocaleTest> = + helpers::read_fixture(path).expect("Failed to read a fixture"); + let bcp47_strings = tests + .iter() + .map(|t| match t.input { + fixtures::LocaleInfo::String(ref s) => s.as_str(), + _ => panic!("Invalid fixture"), + }) + .collect::<Vec<&str>>(); + for a in bcp47_strings.iter() { + for b in bcp47_strings.iter() { + let a_langid = a + .parse::<LanguageIdentifier>() + .expect("Invalid BCP-47 in fixture"); + let a_normalized = a_langid.write_to_string(); + let string_cmp = a_normalized.as_bytes().cmp(b.as_bytes()); + let test_cmp = a_langid.strict_cmp(b.as_bytes()); + assert_eq!(string_cmp, test_cmp, "{a:?}/{b:?}"); + } + } +} diff --git a/third_party/rust/icu_locid/tests/locale.rs b/third_party/rust/icu_locid/tests/locale.rs new file mode 100644 index 0000000000..638db41383 --- /dev/null +++ b/third_party/rust/icu_locid/tests/locale.rs @@ -0,0 +1,120 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +mod fixtures; +mod helpers; + +use std::convert::TryInto; +use writeable::*; + +use icu_locid::{LanguageIdentifier, Locale, ParserError}; + +type Result = std::result::Result<Locale, ParserError>; + +fn test_langid_fixtures(tests: Vec<fixtures::LocaleTest>) { + for test in tests { + match test.output { + fixtures::LocaleInfo::String(s) => { + let input: Locale = test.input.try_into().expect("Parsing failed."); + assert_writeable_eq!(input, s); + } + fixtures::LocaleInfo::Error(err) => { + let err: ParserError = err.into(); + let input: Result = test.input.try_into(); + assert_eq!(input, Err(err)); + } + fixtures::LocaleInfo::Identifier(ident) => { + let input: Locale = test.input.try_into().expect("Parsing failed."); + let output: Locale = ident.clone().try_into().expect("Parsing failed."); + assert_eq!(input, output); + assert_writeable_eq!(input, ident.identifier); + } + fixtures::LocaleInfo::Object(o) => { + let input: Locale = test.input.try_into().expect("Parsing failed."); + let output: Locale = o.try_into().expect("Parsing failed."); + assert_eq!(input, output); + } + } + } +} + +#[test] +fn test_locale_parsing() { + let path = "./tests/fixtures/locale.json"; + let data = helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_langid_invalid() { + let path = "./tests/fixtures/invalid-extensions.json"; + let data = helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_locale_is_empty() { + let locale: Locale = Locale::default(); + assert!(locale.extensions.is_empty()); + assert_writeable_eq!(locale, "und"); +} + +#[test] +fn test_locale_conversions() { + let locale: Locale = Locale::default(); + let langid: LanguageIdentifier = locale.clone().into(); + let locale2: Locale = langid.into(); + assert_eq!(locale, locale2); +} + +#[test] +fn test_locale_canonicalize() { + let path = "./tests/fixtures/canonicalize.json"; + let data = 
helpers::read_fixture(path).expect("Failed to read a fixture"); + + test_langid_fixtures(data); +} + +#[test] +fn test_locale_normalizing_eq_str() { + let path = "./tests/fixtures/locale.json"; + let tests: Vec<fixtures::LocaleTest> = + helpers::read_fixture(path).expect("Failed to read a fixture"); + for test in tests { + let parsed: Locale = test.input.try_into().expect("Parsing failed."); + assert!(parsed.normalizing_eq(&parsed.write_to_string())); + } + + // Check that trailing characters are not ignored + let locale: Locale = "en".parse().expect("Parsing failed."); + assert!(!locale.normalizing_eq("en-US")); +} + +#[test] +fn test_locale_strict_cmp() { + let path = "./tests/fixtures/locale.json"; + let tests: Vec<fixtures::LocaleTest> = + helpers::read_fixture(path).expect("Failed to read a fixture"); + let bcp47_strings = tests + .iter() + .map(|t| match t.input { + fixtures::LocaleInfo::Identifier(ref s) => s.identifier.as_str(), + _ => match t.output { + fixtures::LocaleInfo::Identifier(ref s) => s.identifier.as_str(), + _ => panic!("No string in fixture input or output: {t:?}"), + }, + }) + .collect::<Vec<&str>>(); + for a in bcp47_strings.iter() { + for b in bcp47_strings.iter() { + let a_langid = a.parse::<Locale>().expect("Invalid BCP-47 in fixture"); + let a_normalized = a_langid.write_to_string(); + let string_cmp = a_normalized.as_bytes().cmp(b.as_bytes()); + let test_cmp = a_langid.strict_cmp(b.as_bytes()); + assert_eq!(string_cmp, test_cmp, "{a:?}/{b:?}"); + } + } +} |