From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- third_party/rust/tinystr/.cargo-checksum.json | 1 + third_party/rust/tinystr/Cargo.toml | 120 +++ third_party/rust/tinystr/LICENSE | 51 ++ third_party/rust/tinystr/README.md | 53 ++ third_party/rust/tinystr/benches/common/mod.rs | 79 ++ third_party/rust/tinystr/benches/construct.rs | 89 +++ third_party/rust/tinystr/benches/overview.rs | 165 +++++ third_party/rust/tinystr/benches/read.rs | 34 + third_party/rust/tinystr/benches/serde.rs | 37 + third_party/rust/tinystr/src/ascii.rs | 982 +++++++++++++++++++++++++ third_party/rust/tinystr/src/asciibyte.rs | 145 ++++ third_party/rust/tinystr/src/databake.rs | 21 + third_party/rust/tinystr/src/error.rs | 19 + third_party/rust/tinystr/src/int_ops.rs | 315 ++++++++ third_party/rust/tinystr/src/lib.rs | 116 +++ third_party/rust/tinystr/src/macros.rs | 32 + third_party/rust/tinystr/src/serde.rs | 91 +++ third_party/rust/tinystr/src/ule.rs | 76 ++ third_party/rust/tinystr/tests/serde.rs | 39 + 19 files changed, 2465 insertions(+) create mode 100644 third_party/rust/tinystr/.cargo-checksum.json create mode 100644 third_party/rust/tinystr/Cargo.toml create mode 100644 third_party/rust/tinystr/LICENSE create mode 100644 third_party/rust/tinystr/README.md create mode 100644 third_party/rust/tinystr/benches/common/mod.rs create mode 100644 third_party/rust/tinystr/benches/construct.rs create mode 100644 third_party/rust/tinystr/benches/overview.rs create mode 100644 third_party/rust/tinystr/benches/read.rs create mode 100644 third_party/rust/tinystr/benches/serde.rs create mode 100644 third_party/rust/tinystr/src/ascii.rs create mode 100644 third_party/rust/tinystr/src/asciibyte.rs create mode 100644 third_party/rust/tinystr/src/databake.rs create mode 100644 third_party/rust/tinystr/src/error.rs create mode 100644 
third_party/rust/tinystr/src/int_ops.rs create mode 100644 third_party/rust/tinystr/src/lib.rs create mode 100644 third_party/rust/tinystr/src/macros.rs create mode 100644 third_party/rust/tinystr/src/serde.rs create mode 100644 third_party/rust/tinystr/src/ule.rs create mode 100644 third_party/rust/tinystr/tests/serde.rs (limited to 'third_party/rust/tinystr') diff --git a/third_party/rust/tinystr/.cargo-checksum.json b/third_party/rust/tinystr/.cargo-checksum.json new file mode 100644 index 0000000000..86027edd3c --- /dev/null +++ b/third_party/rust/tinystr/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"b28bb188b7c3b68f9d9a710921a42f337bd2e07eb7588d983b20724f64d6a8c8","LICENSE":"4ad7541d66a407234e2c84902124cef325c29f3e966353efdb800bedb8b8da21","README.md":"8e79e986c696d6010a578b2872ee4144d86e26d30a409167ff2cf2af551ef231","benches/common/mod.rs":"7a31f89cb68cec2574287636ac22fe3fc86a66688b8b1e99700a5da692bd485e","benches/construct.rs":"0e0e7c1459dd3efea0c734a999318078b53e18c3389c74a1ff5a226cd3d05cca","benches/overview.rs":"296d19b32a2d52e449140771d89f9c099d19177eb84e1395c942469d51c4c3f8","benches/read.rs":"cbf349393a50eb90e7ba53906f98a689d585242292f867a37acf6842263af4d9","benches/serde.rs":"5c88866d08c07088b82dbd5472e6276c632d11e064417f5d8f2025a5ade867f0","src/ascii.rs":"403408b47d813110e840d4db688145c37a17fbcbff173038d9e3743aa712b321","src/asciibyte.rs":"fa29de7403c0424c52c2f30bb47002b9abf4bd08b302c411ffe679d3decfb8de","src/databake.rs":"9f29e30e6deec989822cbdf01f5165e098fa544cf7e49ccea3f5de827648fc1e","src/error.rs":"e0cbc912258d6e56aad148404d7cc3213d89736fa9ebe56c41f6cb0df7b2dd63","src/int_ops.rs":"c2be314d19dd41cf18fb3589901d7e58ee32fe3f764fb6a66b08a1e005336406","src/lib.rs":"7ddbd83bcb9091495de3c4a7eb7ecc25313c54991be8b463d67a7c2e97c076b6","src/macros.rs":"3fe76e258b0db2896284bcf4f50a4ac35b7efc542649b4c9f13c6e71c5957ae4","src/serde.rs":"0bd6bbe2ee8195aea68dd235d59b94faa3419aaeb8939e3220dd64bd888873f5","src/ule.rs":"139543634949a95405bc49862840b0794
db089abed6efe66533858376cae180f","tests/serde.rs":"cf8cee82f731928375888d1b5e7e5e50368d3e16ce372fced230c9b1ee2a7451"},"package":"7ac3f5b6856e931e15e07b478e98c8045239829a65f9156d4fa7e7788197a5ef"} \ No newline at end of file diff --git a/third_party/rust/tinystr/Cargo.toml b/third_party/rust/tinystr/Cargo.toml new file mode 100644 index 0000000000..284ce6d2dd --- /dev/null +++ b/third_party/rust/tinystr/Cargo.toml @@ -0,0 +1,120 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +name = "tinystr" +version = "0.7.1" +authors = ["The ICU4X Project Developers"] +include = [ + "src/**/*", + "examples/**/*", + "benches/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md", +] +description = "A small ASCII-only bounded length string representation." 
+keywords = [ + "string", + "str", + "small", + "tiny", + "no_std", +] +categories = ["data-structures"] +license = "Unicode-DFS-2016" +repository = "https://github.com/unicode-org/icu4x" +resolver = "2" + +[package.metadata.workspaces] +independent = true + +[package.metadata.docs.rs] +all-features = true + +[package.metadata.cargo-all-features] +denylist = ["bench"] + +[[test]] +name = "serde" +required-features = ["serde"] + +[[bench]] +name = "overview" +harness = false + +[[bench]] +name = "construct" +harness = false +required-features = ["bench"] + +[[bench]] +name = "read" +harness = false +required-features = ["bench"] + +[[bench]] +name = "serde" +harness = false +required-features = [ + "bench", + "serde", +] + +[dependencies.databake] +version = "0.1.3" +optional = true + +[dependencies.displaydoc] +version = "0.2.3" +default-features = false + +[dependencies.serde] +version = "1.0.123" +features = ["alloc"] +optional = true +default-features = false + +[dependencies.zerovec] +version = "0.9.2" +optional = true + +[dev-dependencies.bincode] +version = "1.3" + +[dev-dependencies.criterion] +version = "0.3" + +[dev-dependencies.postcard] +version = "1.0.0" +features = ["use-std"] + +[dev-dependencies.rand] +version = "0.8.5" +features = ["small_rng"] + +[dev-dependencies.serde_json] +version = "1.0" +features = ["alloc"] +default-features = false + +[dev-dependencies.tinystr_old] +version = "0.4" +features = ["serde"] +package = "tinystr" + +[features] +alloc = [] +bench = [] +default = ["alloc"] +std = [] diff --git a/third_party/rust/tinystr/LICENSE b/third_party/rust/tinystr/LICENSE new file mode 100644 index 0000000000..9858d01abf --- /dev/null +++ b/third_party/rust/tinystr/LICENSE @@ -0,0 +1,51 @@ +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. 
+BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. 
+ +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +— + +Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. +ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. diff --git a/third_party/rust/tinystr/README.md b/third_party/rust/tinystr/README.md new file mode 100644 index 0000000000..5f9a8142b8 --- /dev/null +++ b/third_party/rust/tinystr/README.md @@ -0,0 +1,53 @@ +# tinystr [![crates.io](https://img.shields.io/crates/v/tinystr)](https://crates.io/crates/tinystr) + +`tinystr` is a utility crate of the [`ICU4X`] project. + +It includes [`TinyAsciiStr`], a core API for representing small ASCII-only bounded length strings. + +It is optimized for operations on strings of size 8 or smaller. When use cases involve comparison +and conversion of strings for lowercase/uppercase/titlecase, or checking +numeric/alphabetic/alphanumeric, `TinyAsciiStr` is the edge performance library. 
+ +## Examples + +```rust +use tinystr::TinyAsciiStr; + +let s1: TinyAsciiStr<4> = "tEsT".parse().expect("Failed to parse."); + +assert_eq!(s1, "tEsT"); +assert_eq!(s1.to_ascii_uppercase(), "TEST"); +assert_eq!(s1.to_ascii_lowercase(), "test"); +assert_eq!(s1.to_ascii_titlecase(), "Test"); +assert!(s1.is_ascii_alphanumeric()); +assert!(!s1.is_ascii_numeric()); + +let s2 = TinyAsciiStr::<8>::try_from_raw(*b"New York") + .expect("Failed to parse."); + +assert_eq!(s2, "New York"); +assert_eq!(s2.to_ascii_uppercase(), "NEW YORK"); +assert_eq!(s2.to_ascii_lowercase(), "new york"); +assert_eq!(s2.to_ascii_titlecase(), "New york"); +assert!(!s2.is_ascii_alphanumeric()); +``` + +## Details + +When strings are of size 8 or smaller, the struct transforms the strings as `u32`/`u64` and uses +bitmasking to provide basic string manipulation operations: +* `is_ascii_numeric` +* `is_ascii_alphabetic` +* `is_ascii_alphanumeric` +* `to_ascii_lowercase` +* `to_ascii_uppercase` +* `to_ascii_titlecase` +* `PartialEq` + +`TinyAsciiStr` will fall back to `u8` character manipulation for strings of length greater than 8. + +[`ICU4X`]: ../icu/index.html + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/third_party/rust/tinystr/benches/common/mod.rs b/third_party/rust/tinystr/benches/common/mod.rs new file mode 100644 index 0000000000..07654e1d9d --- /dev/null +++ b/third_party/rust/tinystr/benches/common/mod.rs @@ -0,0 +1,79 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +// This file was adapted from parts of https://github.com/zbraniecki/tinystr + +pub static STRINGS_4: &[&str] = &[ + "US", "GB", "AR", "Hans", "CN", "AT", "PL", "FR", "AT", "Cyrl", "SR", "NO", "FR", "MK", "UK", +]; + +pub static STRINGS_8: &[&str] = &[ + "Latn", "windows", "AR", "Hans", "macos", "AT", "pl", "FR", "en", "Cyrl", "SR", "NO", "419", + "und", "UK", +]; + +pub static STRINGS_16: &[&str] = &[ + "Latn", + "windows", + "AR", + "Hans", + "macos", + "AT", + "infiniband", + "FR", + "en", + "Cyrl", + "FromIntegral", + "NO", + "419", + "MacintoshOSX2019", + "UK", +]; + +#[macro_export] +macro_rules! bench_block { + ($c:expr, $name:expr, $action:ident) => { + let mut group4 = $c.benchmark_group(&format!("{}/4", $name)); + group4.bench_function("String", $action!(String, STRINGS_4)); + group4.bench_function("TinyAsciiStr<4>", $action!(TinyAsciiStr<4>, STRINGS_4)); + group4.bench_function( + "tinystr_old::TinyStr4", + $action!(tinystr_old::TinyStr4, STRINGS_4), + ); + group4.bench_function("TinyAsciiStr<8>", $action!(TinyAsciiStr<8>, STRINGS_4)); + group4.bench_function( + "tinystr_old::TinyStr8", + $action!(tinystr_old::TinyStr8, STRINGS_4), + ); + group4.bench_function("TinyAsciiStr<16>", $action!(TinyAsciiStr<16>, STRINGS_4)); + group4.bench_function( + "tinystr_old::TinyStr16", + $action!(tinystr_old::TinyStr16, STRINGS_4), + ); + group4.finish(); + + let mut group8 = $c.benchmark_group(&format!("{}/8", $name)); + group8.bench_function("String", $action!(String, STRINGS_8)); + group8.bench_function("TinyAsciiStr<8>", $action!(TinyAsciiStr<8>, STRINGS_8)); + group8.bench_function("TinyAsciiStr<16>", $action!(TinyAsciiStr<16>, STRINGS_8)); + group8.bench_function( + "tinystr_old::TinyStr8", + $action!(tinystr_old::TinyStr8, STRINGS_8), + ); + group8.bench_function( + "tinystr_old::TinyStr16", + $action!(tinystr_old::TinyStr16, STRINGS_8), + ); + group8.finish(); + + let mut group16 = $c.benchmark_group(&format!("{}/16", $name)); + 
group16.bench_function("String", $action!(String, STRINGS_16)); + group16.bench_function("TinyAsciiStr<16>", $action!(TinyAsciiStr<16>, STRINGS_16)); + group16.bench_function( + "tinystr_old::TinyStr16", + $action!(tinystr_old::TinyStr16, STRINGS_16), + ); + group16.finish(); + }; +} diff --git a/third_party/rust/tinystr/benches/construct.rs b/third_party/rust/tinystr/benches/construct.rs new file mode 100644 index 0000000000..145e721e12 --- /dev/null +++ b/third_party/rust/tinystr/benches/construct.rs @@ -0,0 +1,89 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// This file was adapted from https://github.com/zbraniecki/tinystr + +mod common; +use common::*; + +use criterion::black_box; +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::Bencher; +use criterion::Criterion; + +use tinystr::TinyAsciiStr; + +fn construct_from_str(c: &mut Criterion) { + macro_rules! cfs { + ($r:ty, $inputs:expr) => { + |b: &mut Bencher| { + b.iter(|| { + for s in $inputs { + let _: $r = black_box(s.parse().unwrap()); + } + }) + } + }; + } + + bench_block!(c, "construct_from_str", cfs); +} + +fn construct_from_bytes(c: &mut Criterion) { + macro_rules! 
cfu { + ($r:ty, $inputs:expr) => { + |b| { + let raw: Vec<&[u8]> = $inputs.iter().map(|s| s.as_bytes()).collect(); + b.iter(move || { + for u in &raw { + let _ = black_box(<$r>::from_bytes(*u).unwrap()); + } + }) + } + }; + } + + let mut group4 = c.benchmark_group("construct_from_bytes/4"); + group4.bench_function("TinyAsciiStr<4>", cfu!(TinyAsciiStr<4>, STRINGS_4)); + group4.bench_function( + "tinystr_old::TinyStr4", + cfu!(tinystr_old::TinyStr4, STRINGS_4), + ); + group4.bench_function("TinyAsciiStr<8>", cfu!(TinyAsciiStr<8>, STRINGS_4)); + group4.bench_function( + "tinystr_old::TinyStr8", + cfu!(tinystr_old::TinyStr8, STRINGS_4), + ); + group4.bench_function("TinyAsciiStr<16>", cfu!(TinyAsciiStr<16>, STRINGS_4)); + group4.bench_function( + "tinystr_old::TinyStr16", + cfu!(tinystr_old::TinyStr16, STRINGS_4), + ); + group4.finish(); + + let mut group8 = c.benchmark_group("construct_from_bytes/8"); + group8.bench_function("TinyAsciiStr<8>", cfu!(TinyAsciiStr<8>, STRINGS_8)); + group8.bench_function( + "tinystr_old::TinyStr8", + cfu!(tinystr_old::TinyStr8, STRINGS_8), + ); + group8.bench_function("TinyAsciiStr<16>", cfu!(TinyAsciiStr<16>, STRINGS_8)); + group8.bench_function( + "tinystr_old::TinyStr16", + cfu!(tinystr_old::TinyStr16, STRINGS_8), + ); + group8.finish(); + + let mut group16 = c.benchmark_group("construct_from_bytes/16"); + group16.bench_function("TinyAsciiStr<16>", cfu!(TinyAsciiStr<16>, STRINGS_16)); + group16.bench_function( + "tinystr_old::TinyStr16", + cfu!(tinystr_old::TinyStr16, STRINGS_16), + ); + group16.finish(); +} + +criterion_group!(benches, construct_from_str, construct_from_bytes,); +criterion_main!(benches); diff --git a/third_party/rust/tinystr/benches/overview.rs b/third_party/rust/tinystr/benches/overview.rs new file mode 100644 index 0000000000..4911832ec4 --- /dev/null +++ b/third_party/rust/tinystr/benches/overview.rs @@ -0,0 +1,165 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod common; +use common::*; + +use criterion::black_box; +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::Criterion; + +use tinystr::TinyAsciiStr; +use tinystr_old::TinyStr16; +use tinystr_old::TinyStr4; +use tinystr_old::TinyStr8; + +fn overview(c: &mut Criterion) { + let mut g = c.benchmark_group("overview"); + + g.bench_function("construct/TinyAsciiStr", |b| { + b.iter(|| { + for s in STRINGS_4 { + let _: TinyAsciiStr<4> = black_box(s).parse().unwrap(); + let _: TinyAsciiStr<8> = black_box(s).parse().unwrap(); + let _: TinyAsciiStr<16> = black_box(s).parse().unwrap(); + } + for s in STRINGS_8 { + let _: TinyAsciiStr<8> = black_box(s).parse().unwrap(); + let _: TinyAsciiStr<16> = black_box(s).parse().unwrap(); + } + for s in STRINGS_16 { + let _: TinyAsciiStr<16> = black_box(s).parse().unwrap(); + } + }); + }); + + g.bench_function("construct/TinyStr", |b| { + b.iter(|| { + for s in STRINGS_4 { + let _: TinyStr4 = black_box(s).parse().unwrap(); + let _: TinyStr8 = black_box(s).parse().unwrap(); + let _: TinyStr16 = black_box(s).parse().unwrap(); + } + for s in STRINGS_8 { + let _: TinyStr8 = black_box(s).parse().unwrap(); + let _: TinyStr16 = black_box(s).parse().unwrap(); + } + for s in STRINGS_16 { + let _: TinyStr16 = black_box(s).parse().unwrap(); + } + }); + }); + + let parsed_ascii_4: Vec> = STRINGS_4 + .iter() + .map(|s| s.parse::>().unwrap()) + .collect(); + let parsed_ascii_8: Vec> = STRINGS_4 + .iter() + .chain(STRINGS_8) + .map(|s| s.parse::>().unwrap()) + .collect(); + let parsed_ascii_16: Vec> = STRINGS_4 + .iter() + .chain(STRINGS_8) + .chain(STRINGS_16) + .map(|s| s.parse::>().unwrap()) + .collect(); + + let parsed_tiny_4: Vec = STRINGS_4 + .iter() + .map(|s| s.parse::().unwrap()) + .collect(); + let parsed_tiny_8: Vec = STRINGS_4 + .iter() + 
.chain(STRINGS_8) + .map(|s| s.parse::().unwrap()) + .collect(); + let parsed_tiny_16: Vec = STRINGS_4 + .iter() + .chain(STRINGS_8) + .chain(STRINGS_16) + .map(|s| s.parse::().unwrap()) + .collect(); + + g.bench_function("read/TinyAsciiStr", |b| { + b.iter(|| { + let mut collector: usize = 0; + for t in black_box(&parsed_ascii_4) { + let s: &str = t; + collector += s.bytes().map(usize::from).sum::(); + } + for t in black_box(&parsed_ascii_8) { + let s: &str = t; + collector += s.bytes().map(usize::from).sum::(); + } + for t in black_box(&parsed_ascii_16) { + let s: &str = t; + collector += s.bytes().map(usize::from).sum::(); + } + collector + }); + }); + + g.bench_function("read/TinyStr", |b| { + b.iter(|| { + let mut collector: usize = 0; + for t in black_box(&parsed_tiny_4) { + let s: &str = t; + collector += s.bytes().map(usize::from).sum::(); + } + for t in black_box(&parsed_tiny_8) { + let s: &str = t; + collector += s.bytes().map(usize::from).sum::(); + } + for t in black_box(&parsed_tiny_16) { + let s: &str = t; + collector += s.bytes().map(usize::from).sum::(); + } + collector + }); + }); + + g.bench_function("compare/TinyAsciiStr", |b| { + b.iter(|| { + let mut collector: usize = 0; + for ts in black_box(&parsed_ascii_4).windows(2) { + let o = ts[0].cmp(&ts[1]); + collector ^= o as usize; + } + for ts in black_box(&parsed_ascii_8).windows(2) { + let o = ts[0].cmp(&ts[1]); + collector ^= o as usize; + } + for ts in black_box(&parsed_ascii_16).windows(2) { + let o = ts[0].cmp(&ts[1]); + collector ^= o as usize; + } + collector + }); + }); + + g.bench_function("compare/TinyStr", |b| { + b.iter(|| { + let mut collector: usize = 0; + for ts in black_box(&parsed_tiny_4).windows(2) { + let o = ts[0].cmp(&ts[1]); + collector ^= o as usize; + } + for ts in black_box(&parsed_tiny_8).windows(2) { + let o = ts[0].cmp(&ts[1]); + collector ^= o as usize; + } + for ts in black_box(&parsed_tiny_16).windows(2) { + let o = ts[0].cmp(&ts[1]); + collector ^= o as usize; + } 
+ collector + }); + }); +} + +criterion_group!(benches, overview,); +criterion_main!(benches); diff --git a/third_party/rust/tinystr/benches/read.rs b/third_party/rust/tinystr/benches/read.rs new file mode 100644 index 0000000000..793bb14f87 --- /dev/null +++ b/third_party/rust/tinystr/benches/read.rs @@ -0,0 +1,34 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod common; +use common::*; + +use criterion::black_box; +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::Bencher; +use criterion::Criterion; + +use tinystr::TinyAsciiStr; + +fn read(c: &mut Criterion) { + macro_rules! cfs { + ($r:ty, $inputs:expr) => { + |b: &mut Bencher| { + let parsed: Vec<$r> = $inputs.iter().map(|s| s.parse().unwrap()).collect(); + b.iter(|| { + for s in &parsed { + let _: &str = black_box(&**s); + } + }) + } + }; + } + + bench_block!(c, "read", cfs); +} + +criterion_group!(benches, read,); +criterion_main!(benches); diff --git a/third_party/rust/tinystr/benches/serde.rs b/third_party/rust/tinystr/benches/serde.rs new file mode 100644 index 0000000000..b0341221d0 --- /dev/null +++ b/third_party/rust/tinystr/benches/serde.rs @@ -0,0 +1,37 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod common; +use common::*; + +use criterion::black_box; +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::Bencher; +use criterion::Criterion; + +use tinystr::TinyAsciiStr; + +fn deserialize(c: &mut Criterion) { + macro_rules! 
cfs { + ($r:ty, $inputs:expr) => { + |b: &mut Bencher| { + let serialized: Vec> = $inputs + .iter() + .map(|s| postcard::to_stdvec(&s.parse::<$r>().unwrap()).unwrap()) + .collect(); + b.iter(|| { + for bytes in &serialized { + let _: Result<$r, _> = black_box(postcard::from_bytes(bytes)); + } + }) + } + }; + } + + bench_block!(c, "deserialize", cfs); +} + +criterion_group!(benches, deserialize,); +criterion_main!(benches); diff --git a/third_party/rust/tinystr/src/ascii.rs b/third_party/rust/tinystr/src/ascii.rs new file mode 100644 index 0000000000..f39f39b734 --- /dev/null +++ b/third_party/rust/tinystr/src/ascii.rs @@ -0,0 +1,982 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::asciibyte::AsciiByte; +use crate::int_ops::{Aligned4, Aligned8}; +use crate::TinyStrError; +use core::fmt; +use core::ops::Deref; +use core::str::{self, FromStr}; + +#[repr(transparent)] +#[derive(PartialEq, Eq, Ord, PartialOrd, Copy, Clone, Hash)] +pub struct TinyAsciiStr { + bytes: [AsciiByte; N], +} + +impl TinyAsciiStr { + /// Creates a `TinyAsciiStr` from the given byte slice. + /// `bytes` may contain at most `N` non-null ASCII bytes. + pub const fn from_bytes(bytes: &[u8]) -> Result { + Self::from_bytes_inner(bytes, 0, bytes.len(), false) + } + + /// Attempts to parse a fixed-length byte array to a `TinyAsciiStr`. + /// + /// The byte array may contain trailing NUL bytes. 
+ /// + /// # Example + /// + /// ``` + /// use tinystr::tinystr; + /// use tinystr::TinyAsciiStr; + /// + /// assert_eq!( + /// TinyAsciiStr::<3>::try_from_raw(*b"GB\0"), + /// Ok(tinystr!(3, "GB")) + /// ); + /// assert_eq!( + /// TinyAsciiStr::<3>::try_from_raw(*b"USD"), + /// Ok(tinystr!(3, "USD")) + /// ); + /// assert!(matches!(TinyAsciiStr::<3>::try_from_raw(*b"\0A\0"), Err(_))); + /// ``` + pub const fn try_from_raw(raw: [u8; N]) -> Result { + Self::from_bytes_inner(&raw, 0, N, true) + } + + /// Equivalent to [`from_bytes(bytes[start..end])`](Self::from_bytes), + /// but callable in a `const` context (which range indexing is not). + pub const fn from_bytes_manual_slice( + bytes: &[u8], + start: usize, + end: usize, + ) -> Result { + Self::from_bytes_inner(bytes, start, end, false) + } + + #[inline] + pub(crate) const fn from_bytes_inner( + bytes: &[u8], + start: usize, + end: usize, + allow_trailing_null: bool, + ) -> Result { + let len = end - start; + if len > N { + return Err(TinyStrError::TooLarge { max: N, len }); + } + + let mut out = [0; N]; + let mut i = 0; + let mut found_null = false; + // Indexing is protected by TinyStrError::TooLarge + #[allow(clippy::indexing_slicing)] + while i < len { + let b = bytes[start + i]; + + if b == 0 { + found_null = true; + } else if b >= 0x80 { + return Err(TinyStrError::NonAscii); + } else if found_null { + // Error if there are contentful bytes after null + return Err(TinyStrError::ContainsNull); + } + out[i] = b; + + i += 1; + } + + if !allow_trailing_null && found_null { + // We found some trailing nulls, error + return Err(TinyStrError::ContainsNull); + } + + Ok(Self { + // SAFETY: `out` only contains ASCII bytes and has same size as `self.bytes` + bytes: unsafe { AsciiByte::to_ascii_byte_array(&out) }, + }) + } + + // TODO: This function shadows the FromStr trait. Rename? 
+ #[inline] + pub const fn from_str(s: &str) -> Result { + Self::from_bytes_inner(s.as_bytes(), 0, s.len(), false) + } + + #[inline] + pub const fn as_str(&self) -> &str { + // as_bytes is valid utf8 + unsafe { str::from_utf8_unchecked(self.as_bytes()) } + } + + #[inline] + #[must_use] + pub const fn len(&self) -> usize { + if N <= 4 { + Aligned4::from_ascii_bytes(&self.bytes).len() + } else if N <= 8 { + Aligned8::from_ascii_bytes(&self.bytes).len() + } else { + let mut i = 0; + #[allow(clippy::indexing_slicing)] // < N is safe + while i < N && self.bytes[i] as u8 != AsciiByte::B0 as u8 { + i += 1 + } + i + } + } + + #[inline] + #[must_use] + pub const fn is_empty(&self) -> bool { + self.bytes[0] as u8 == AsciiByte::B0 as u8 + } + + #[inline] + #[must_use] + pub const fn as_bytes(&self) -> &[u8] { + // Safe because `self.bytes.as_slice()` pointer-casts to `&[u8]`, + // and changing the length of that slice to self.len() < N is safe. + unsafe { core::mem::transmute((self.bytes.as_slice().as_ptr(), self.len())) } + } + + #[inline] + #[must_use] + pub const fn all_bytes(&self) -> &[u8; N] { + // SAFETY: `self.bytes` has same size as [u8; N] + unsafe { core::mem::transmute(&self.bytes) } + } + + #[inline] + #[must_use] + /// Resizes a TinyAsciiStr to a TinyAsciiStr. + /// + /// If M < len() the string gets truncated, otherwise only the + /// memory representation changes. + pub const fn resize(self) -> TinyAsciiStr { + let mut bytes = [0; M]; + let mut i = 0; + // Indexing is protected by the loop guard + #[allow(clippy::indexing_slicing)] + while i < M && i < N { + bytes[i] = self.bytes[i] as u8; + i += 1; + } + // `self.bytes` only contains ASCII bytes, with no null bytes between + // ASCII characters, so this also holds for `bytes`. 
+ unsafe { TinyAsciiStr::from_bytes_unchecked(bytes) } + } + + /// # Safety + /// Must be called with a bytes array made of valid ASCII bytes, with no null bytes + /// between ASCII characters + #[must_use] + pub const unsafe fn from_bytes_unchecked(bytes: [u8; N]) -> Self { + Self { + bytes: AsciiByte::to_ascii_byte_array(&bytes), + } + } +} + +macro_rules! check_is { + ($self:ident, $check_int:ident, $check_u8:ident) => { + if N <= 4 { + Aligned4::from_ascii_bytes(&$self.bytes).$check_int() + } else if N <= 8 { + Aligned8::from_ascii_bytes(&$self.bytes).$check_int() + } else { + let mut i = 0; + // Won't panic because self.bytes has length N + #[allow(clippy::indexing_slicing)] + while i < N && $self.bytes[i] as u8 != AsciiByte::B0 as u8 { + if !($self.bytes[i] as u8).$check_u8() { + return false; + } + i += 1; + } + true + } + }; + ($self:ident, $check_int:ident, !$check_u8_0_inv:ident, !$check_u8_1_inv:ident) => { + if N <= 4 { + Aligned4::from_ascii_bytes(&$self.bytes).$check_int() + } else if N <= 8 { + Aligned8::from_ascii_bytes(&$self.bytes).$check_int() + } else { + // Won't panic because N is > 8 + if ($self.bytes[0] as u8).$check_u8_0_inv() { + return false; + } + let mut i = 1; + // Won't panic because self.bytes has length N + #[allow(clippy::indexing_slicing)] + while i < N && $self.bytes[i] as u8 != AsciiByte::B0 as u8 { + if ($self.bytes[i] as u8).$check_u8_1_inv() { + return false; + } + i += 1; + } + true + } + }; + ($self:ident, $check_int:ident, $check_u8_0_inv:ident, $check_u8_1_inv:ident) => { + if N <= 4 { + Aligned4::from_ascii_bytes(&$self.bytes).$check_int() + } else if N <= 8 { + Aligned8::from_ascii_bytes(&$self.bytes).$check_int() + } else { + // Won't panic because N is > 8 + if !($self.bytes[0] as u8).$check_u8_0_inv() { + return false; + } + let mut i = 1; + // Won't panic because self.bytes has length N + #[allow(clippy::indexing_slicing)] + while i < N && $self.bytes[i] as u8 != AsciiByte::B0 as u8 { + if !($self.bytes[i] as 
u8).$check_u8_1_inv() { + return false; + } + i += 1; + } + true + } + }; +} + +impl TinyAsciiStr { + /// Checks if the value is composed of ASCII alphabetic characters: + /// + /// * U+0041 'A' ..= U+005A 'Z', or + /// * U+0061 'a' ..= U+007A 'z'. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "Test".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "Te3t".parse().expect("Failed to parse."); + /// + /// assert!(s1.is_ascii_alphabetic()); + /// assert!(!s2.is_ascii_alphabetic()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_alphabetic(&self) -> bool { + check_is!(self, is_ascii_alphabetic, is_ascii_alphabetic) + } + + /// Checks if the value is composed of ASCII alphanumeric characters: + /// + /// * U+0041 'A' ..= U+005A 'Z', or + /// * U+0061 'a' ..= U+007A 'z', or + /// * U+0030 '0' ..= U+0039 '9'. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "A15b".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "[3@w".parse().expect("Failed to parse."); + /// + /// assert!(s1.is_ascii_alphanumeric()); + /// assert!(!s2.is_ascii_alphanumeric()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_alphanumeric(&self) -> bool { + check_is!(self, is_ascii_alphanumeric, is_ascii_alphanumeric) + } + + /// Checks if the value is composed of ASCII decimal digits: + /// + /// * U+0030 '0' ..= U+0039 '9'. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "312".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "3d".parse().expect("Failed to parse."); + /// + /// assert!(s1.is_ascii_numeric()); + /// assert!(!s2.is_ascii_numeric()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_numeric(&self) -> bool { + check_is!(self, is_ascii_numeric, is_ascii_digit) + } + + /// Checks if the value is in ASCII lower case. 
+ /// + /// All letter characters are checked for case. Non-letter characters are ignored. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "test".parse().expect("Failed to parse."); + /// let s3: TinyAsciiStr<4> = "001z".parse().expect("Failed to parse."); + /// + /// assert!(!s1.is_ascii_lowercase()); + /// assert!(s2.is_ascii_lowercase()); + /// assert!(s3.is_ascii_lowercase()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_lowercase(&self) -> bool { + check_is!( + self, + is_ascii_lowercase, + !is_ascii_uppercase, + !is_ascii_uppercase + ) + } + + /// Checks if the value is in ASCII title case. + /// + /// This verifies that the first character is ASCII uppercase and all others ASCII lowercase. + /// Non-letter characters are ignored. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "Test".parse().expect("Failed to parse."); + /// let s3: TinyAsciiStr<4> = "001z".parse().expect("Failed to parse."); + /// + /// assert!(!s1.is_ascii_titlecase()); + /// assert!(s2.is_ascii_titlecase()); + /// assert!(s3.is_ascii_titlecase()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_titlecase(&self) -> bool { + check_is!( + self, + is_ascii_titlecase, + !is_ascii_lowercase, + !is_ascii_uppercase + ) + } + + /// Checks if the value is in ASCII upper case. + /// + /// All letter characters are checked for case. Non-letter characters are ignored. 
+ /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "TEST".parse().expect("Failed to parse."); + /// let s3: TinyAsciiStr<4> = "001z".parse().expect("Failed to parse."); + /// + /// assert!(!s1.is_ascii_uppercase()); + /// assert!(s2.is_ascii_uppercase()); + /// assert!(!s3.is_ascii_uppercase()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_uppercase(&self) -> bool { + check_is!( + self, + is_ascii_uppercase, + !is_ascii_lowercase, + !is_ascii_lowercase + ) + } + + /// Checks if the value is composed of ASCII alphabetic lower case characters: + /// + /// * U+0061 'a' ..= U+007A 'z', + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "Test".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "Te3t".parse().expect("Failed to parse."); + /// let s3: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// let s4: TinyAsciiStr<4> = "test".parse().expect("Failed to parse."); + /// let s5: TinyAsciiStr<4> = "001z".parse().expect("Failed to parse."); + /// + /// assert!(!s1.is_ascii_alphabetic_lowercase()); + /// assert!(!s2.is_ascii_alphabetic_lowercase()); + /// assert!(!s3.is_ascii_alphabetic_lowercase()); + /// assert!(s4.is_ascii_alphabetic_lowercase()); + /// assert!(!s5.is_ascii_alphabetic_lowercase()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { + check_is!( + self, + is_ascii_alphabetic_lowercase, + is_ascii_lowercase, + is_ascii_lowercase + ) + } + + /// Checks if the value is composed of ASCII alphabetic, with the first character being ASCII uppercase, and all others ASCII lowercase. 
+ /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "Test".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "Te3t".parse().expect("Failed to parse."); + /// let s3: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// let s4: TinyAsciiStr<4> = "test".parse().expect("Failed to parse."); + /// let s5: TinyAsciiStr<4> = "001z".parse().expect("Failed to parse."); + /// + /// assert!(s1.is_ascii_alphabetic_titlecase()); + /// assert!(!s2.is_ascii_alphabetic_titlecase()); + /// assert!(!s3.is_ascii_alphabetic_titlecase()); + /// assert!(!s4.is_ascii_alphabetic_titlecase()); + /// assert!(!s5.is_ascii_alphabetic_titlecase()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { + check_is!( + self, + is_ascii_alphabetic_titlecase, + is_ascii_uppercase, + is_ascii_lowercase + ) + } + + /// Checks if the value is composed of ASCII alphabetic upper case characters: + /// + /// * U+0041 'A' ..= U+005A 'Z', + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "Test".parse().expect("Failed to parse."); + /// let s2: TinyAsciiStr<4> = "Te3t".parse().expect("Failed to parse."); + /// let s3: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// let s4: TinyAsciiStr<4> = "TEST".parse().expect("Failed to parse."); + /// let s5: TinyAsciiStr<4> = "001z".parse().expect("Failed to parse."); + /// + /// assert!(!s1.is_ascii_alphabetic_uppercase()); + /// assert!(!s2.is_ascii_alphabetic_uppercase()); + /// assert!(!s3.is_ascii_alphabetic_uppercase()); + /// assert!(s4.is_ascii_alphabetic_uppercase()); + /// assert!(!s5.is_ascii_alphabetic_uppercase()); + /// ``` + #[inline] + #[must_use] + pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { + check_is!( + self, + is_ascii_alphabetic_uppercase, + is_ascii_uppercase, + is_ascii_uppercase + ) + } +} + +macro_rules! 
to { + ($self:ident, $to:ident, $later_char_to:ident $(,$first_char_to:ident)?) => {{ + let mut i = 0; + if N <= 4 { + let aligned = Aligned4::from_ascii_bytes(&$self.bytes).$to().to_ascii_bytes(); + // Won't panic because self.bytes has length N and aligned has length >= N + #[allow(clippy::indexing_slicing)] + while i < N { + $self.bytes[i] = aligned[i]; + i += 1; + } + } else if N <= 8 { + let aligned = Aligned8::from_ascii_bytes(&$self.bytes).$to().to_ascii_bytes(); + // Won't panic because self.bytes has length N and aligned has length >= N + #[allow(clippy::indexing_slicing)] + while i < N { + $self.bytes[i] = aligned[i]; + i += 1; + } + } else { + // Won't panic because self.bytes has length N + #[allow(clippy::indexing_slicing)] + while i < N && $self.bytes[i] as u8 != AsciiByte::B0 as u8 { + // SAFETY: AsciiByte is repr(u8) and has same size as u8 + unsafe { + $self.bytes[i] = core::mem::transmute( + ($self.bytes[i] as u8).$later_char_to() + ); + } + i += 1; + } + // SAFETY: AsciiByte is repr(u8) and has same size as u8 + $( + $self.bytes[0] = unsafe { + core::mem::transmute(($self.bytes[0] as u8).$first_char_to()) + }; + )? + } + $self + }}; +} + +impl TinyAsciiStr { + /// Converts this type to its ASCII lower case equivalent in-place. + /// + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', other characters are unchanged. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "TeS3".parse().expect("Failed to parse."); + /// + /// assert_eq!(&*s1.to_ascii_lowercase(), "tes3"); + /// ``` + #[inline] + #[must_use] + pub const fn to_ascii_lowercase(mut self) -> Self { + to!(self, to_ascii_lowercase, to_ascii_lowercase) + } + + /// Converts this type to its ASCII title case equivalent in-place. + /// + /// The first character is converted to ASCII uppercase; the remaining characters + /// are converted to ASCII lowercase. 
+ /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "teSt".parse().expect("Failed to parse."); + /// + /// assert_eq!(&*s1.to_ascii_titlecase(), "Test"); + /// ``` + #[inline] + #[must_use] + pub const fn to_ascii_titlecase(mut self) -> Self { + to!( + self, + to_ascii_titlecase, + to_ascii_lowercase, + to_ascii_uppercase + ) + } + + /// Converts this type to its ASCII upper case equivalent in-place. + /// + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', other characters are unchanged. + /// + /// # Examples + /// + /// ``` + /// use tinystr::TinyAsciiStr; + /// + /// let s1: TinyAsciiStr<4> = "Tes3".parse().expect("Failed to parse."); + /// + /// assert_eq!(&*s1.to_ascii_uppercase(), "TES3"); + /// ``` + #[inline] + #[must_use] + pub const fn to_ascii_uppercase(mut self) -> Self { + to!(self, to_ascii_uppercase, to_ascii_uppercase) + } +} + +impl fmt::Debug for TinyAsciiStr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self.as_str(), f) + } +} + +impl fmt::Display for TinyAsciiStr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self.as_str(), f) + } +} + +impl Deref for TinyAsciiStr { + type Target = str; + #[inline] + fn deref(&self) -> &str { + self.as_str() + } +} + +impl FromStr for TinyAsciiStr { + type Err = TinyStrError; + #[inline] + fn from_str(s: &str) -> Result { + Self::from_str(s) + } +} + +impl PartialEq for TinyAsciiStr { + fn eq(&self, other: &str) -> bool { + self.deref() == other + } +} + +impl PartialEq<&str> for TinyAsciiStr { + fn eq(&self, other: &&str) -> bool { + self.deref() == *other + } +} + +#[cfg(feature = "alloc")] +impl PartialEq for TinyAsciiStr { + fn eq(&self, other: &alloc::string::String) -> bool { + self.deref() == other.deref() + } +} + +#[cfg(feature = "alloc")] +impl PartialEq> for alloc::string::String { + fn eq(&self, other: &TinyAsciiStr) -> bool { + self.deref() == other.deref() + } +} + 
#[cfg(test)]
mod test {
    use super::*;
    use rand::distributions::Distribution;
    use rand::distributions::Standard;
    use rand::rngs::SmallRng;
    use rand::seq::SliceRandom;
    use rand::SeedableRng;

    /// Hand-picked inputs covering mixed case, digits, punctuation and several lengths.
    const STRINGS: [&str; 26] = [
        "Latn",
        "laTn",
        "windows",
        "AR",
        "Hans",
        "macos",
        "AT",
        "infiniband",
        "FR",
        "en",
        "Cyrl",
        "FromIntegral",
        "NO",
        "419",
        "MacintoshOSX2019",
        "a3z",
        "A3z",
        "A3Z",
        "a3Z",
        "3A",
        "3Z",
        "3a",
        "3z",
        "@@[`{",
        "UK",
        "E12",
    ];

    /// Generates `num_strings` pseudo-random non-NUL ASCII strings (fixed seed), each
    /// with a length drawn from `allowed_lengths`.
    fn gen_strings(num_strings: usize, allowed_lengths: &[usize]) -> Vec<String> {
        let mut rng = SmallRng::seed_from_u64(2022);
        // Need to do this in 2 steps since the RNG is needed twice
        let string_lengths = core::iter::repeat_with(|| *allowed_lengths.choose(&mut rng).unwrap())
            .take(num_strings)
            .collect::<Vec<usize>>();
        string_lengths
            .iter()
            .map(|len| {
                Standard
                    .sample_iter(&mut rng)
                    .filter(|b: &u8| *b > 0 && *b < 0x80)
                    .take(*len)
                    .collect::<Vec<u8>>()
            })
            .map(|byte_vec| String::from_utf8(byte_vec).expect("All ASCII"))
            .collect()
    }

    /// Runs `tinystr_f` on every test string that fits in `TinyAsciiStr<N>` and checks
    /// the result against `reference_f` applied to the plain `&str`. Strings longer
    /// than `N` are skipped; any other construction error fails the test.
    fn check_operation<T, F1, F2, const N: usize>(reference_f: F1, tinystr_f: F2)
    where
        F1: Fn(&str) -> T,
        F2: Fn(TinyAsciiStr<N>) -> T,
        T: core::fmt::Debug + core::cmp::PartialEq,
    {
        for s in STRINGS
            .into_iter()
            .map(str::to_owned)
            .chain(gen_strings(100, &[3, 4, 5, 8, 12]))
        {
            let t = match TinyAsciiStr::<N>::from_str(&s) {
                Ok(t) => t,
                Err(TinyStrError::TooLarge { .. }) => continue,
                Err(e) => panic!("{}", e),
            };
            let expected = reference_f(&s);
            let actual = tinystr_f(t);
            assert_eq!(expected, actual, "TinyAsciiStr<{}>: {:?}", N, s);
        }
    }

    #[test]
    fn test_is_ascii_alphabetic() {
        fn check<const N: usize>() {
            check_operation(
                |s| s.chars().all(|c| c.is_ascii_alphabetic()),
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_alphabetic(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_alphanumeric() {
        fn check<const N: usize>() {
            check_operation(
                |s| s.chars().all(|c| c.is_ascii_alphanumeric()),
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_alphanumeric(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_numeric() {
        fn check<const N: usize>() {
            check_operation(
                |s| s.chars().all(|c| c.is_ascii_digit()),
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_numeric(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_lowercase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    s == TinyAsciiStr::<16>::from_str(s)
                        .unwrap()
                        .to_ascii_lowercase()
                        .as_str()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_lowercase(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_titlecase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    s == TinyAsciiStr::<16>::from_str(s)
                        .unwrap()
                        .to_ascii_titlecase()
                        .as_str()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_titlecase(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_uppercase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    s == TinyAsciiStr::<16>::from_str(s)
                        .unwrap()
                        .to_ascii_uppercase()
                        .as_str()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_uppercase(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_alphabetic_lowercase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    // Check alphabetic
                    s.chars().all(|c| c.is_ascii_alphabetic()) &&
                    // Check lowercase
                    s == TinyAsciiStr::<16>::from_str(s)
                        .unwrap()
                        .to_ascii_lowercase()
                        .as_str()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_alphabetic_lowercase(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_alphabetic_titlecase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    // Check alphabetic
                    s.chars().all(|c| c.is_ascii_alphabetic()) &&
                    // Check titlecase
                    s == TinyAsciiStr::<16>::from_str(s)
                        .unwrap()
                        .to_ascii_titlecase()
                        .as_str()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_alphabetic_titlecase(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_is_ascii_alphabetic_uppercase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    // Check alphabetic
                    s.chars().all(|c| c.is_ascii_alphabetic()) &&
                    // Check uppercase
                    s == TinyAsciiStr::<16>::from_str(s)
                        .unwrap()
                        .to_ascii_uppercase()
                        .as_str()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::is_ascii_alphabetic_uppercase(&t),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_to_ascii_lowercase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    s.chars()
                        .map(|c| c.to_ascii_lowercase())
                        .collect::<String>()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::to_ascii_lowercase(t).as_str().to_owned(),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_to_ascii_titlecase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    let mut r = s
                        .chars()
                        .map(|c| c.to_ascii_lowercase())
                        .collect::<String>();
                    // Safe because the string is nonempty and an ASCII string
                    unsafe { r.as_bytes_mut()[0].make_ascii_uppercase() };
                    r
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::to_ascii_titlecase(t).as_str().to_owned(),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }

    #[test]
    fn test_to_ascii_uppercase() {
        fn check<const N: usize>() {
            check_operation(
                |s| {
                    s.chars()
                        .map(|c| c.to_ascii_uppercase())
                        .collect::<String>()
                },
                |t: TinyAsciiStr<N>| TinyAsciiStr::to_ascii_uppercase(t).as_str().to_owned(),
            )
        }
        check::<2>();
        check::<3>();
        check::<4>();
        check::<5>();
        check::<8>();
        check::<16>();
    }
}

// ---- third_party/rust/tinystr/src/asciibyte.rs ----
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// A single ASCII byte, with one variant per value `0..=127`.
///
/// The enum is `repr(u8)`, so it has the same size and layout as `u8`, while only the
/// 128 ASCII values are valid bit patterns.
#[repr(u8)]
#[allow(dead_code)]
#[derive(PartialEq, Eq, Ord, PartialOrd, Copy, Clone, Hash)]
pub enum AsciiByte {
    B0 = 0,
    B1 = 1,
    B2 = 2,
    B3 = 3,
    B4 = 4,
    B5 = 5,
    B6 = 6,
    B7 = 7,
    B8 = 8,
    B9 = 9,
    B10 = 10,
    B11 = 11,
    B12 = 12,
    B13 = 13,
    B14 = 14,
    B15 = 15,
    B16 = 16,
    B17 = 17,
    B18 = 18,
    B19 = 19,
    B20 = 20,
    B21 = 21,
    B22 = 22,
    B23 = 23,
    B24 = 24,
    B25 = 25,
    B26 = 26,
    B27 = 27,
    B28 = 28,
    B29 = 29,
    B30 = 30,
    B31 = 31,
    B32 = 32,
    B33 = 33,
    B34 = 34,
    B35 = 35,
    B36 = 36,
    B37 = 37,
    B38 = 38,
    B39 = 39,
    B40 = 40,
    B41 = 41,
    B42 = 42,
    B43 = 43,
    B44 = 44,
    B45 = 45,
    B46 = 46,
    B47 = 47,
    B48 = 48,
    B49 = 49,
    B50 = 50,
    B51 = 51,
    B52 = 52,
    B53 = 53,
    B54 = 54,
    B55 = 55,
    B56 = 56,
    B57 = 57,
    B58 = 58,
    B59 = 59,
    B60 = 60,
    B61 = 61,
    B62 = 62,
    B63 = 63,
    B64 = 64,
    B65 = 65,
    B66 = 66,
    B67 = 67,
    B68 = 68,
    B69 = 69,
    B70 = 70,
    B71 = 71,
    B72 = 72,
    B73 = 73,
    B74 = 74,
    B75 = 75,
    B76 = 76,
    B77 = 77,
    B78 = 78,
    B79 = 79,
    B80 = 80,
    B81 = 81,
    B82 = 82,
    B83 = 83,
    B84 = 84,
    B85 = 85,
    B86 = 86,
    B87 = 87,
    B88 = 88,
    B89 = 89,
    B90 = 90,
    B91 = 91,
    B92 = 92,
    B93 = 93,
    B94 = 94,
    B95 = 95,
    B96 = 96,
    B97 = 97,
    B98 = 98,
    B99 = 99,
    B100 = 100,
    B101 = 101,
    B102 = 102,
    B103 = 103,
    B104 = 104,
    B105 = 105,
    B106 = 106,
    B107 = 107,
    B108 = 108,
    B109 = 109,
    B110 = 110,
    B111 = 111,
    B112 = 112,
    B113 = 113,
    B114 = 114,
    B115 = 115,
    B116 = 116,
    B117 = 117,
    B118 = 118,
    B119 = 119,
    B120 = 120,
    B121 = 121,
    B122 = 122,
    B123 = 123,
    B124 = 124,
    B125 = 125,
    B126 = 126,
    B127 = 127,
}

impl AsciiByte {
    /// Convert `[u8; N]` to `[AsciiByte; N]` by reinterpreting the array in place.
    ///
    /// # Safety
    ///
    /// Every byte in `bytes` must be in the range `0..=127`. Any larger value is not a
    /// valid `AsciiByte` discriminant, and producing one is undefined behavior.
    #[inline]
    pub const unsafe fn to_ascii_byte_array<const N: usize>(bytes: &[u8; N]) -> [AsciiByte; N] {
        // `AsciiByte` is `repr(u8)`, so `[AsciiByte; N]` has the same size and alignment
        // as `[u8; N]`; validity of each element is the caller's obligation (see above).
        *(bytes as *const [u8; N] as *const [AsciiByte; N])
    }
}
b/third_party/rust/tinystr/src/databake.rs new file mode 100644 index 0000000000..e10c194f82 --- /dev/null +++ b/third_party/rust/tinystr/src/databake.rs @@ -0,0 +1,21 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::TinyAsciiStr; +use databake::*; + +impl Bake for TinyAsciiStr { + fn bake(&self, env: &CrateEnv) -> TokenStream { + env.insert("tinystr"); + let string = self.as_str(); + quote! { + ::tinystr::tinystr!(#N, #string) + } + } +} + +#[test] +fn test() { + test_bake!(TinyAsciiStr<10>, const: crate::tinystr!(10usize, "foo"), tinystr); +} diff --git a/third_party/rust/tinystr/src/error.rs b/third_party/rust/tinystr/src/error.rs new file mode 100644 index 0000000000..7910f8b484 --- /dev/null +++ b/third_party/rust/tinystr/src/error.rs @@ -0,0 +1,19 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use displaydoc::Display; + +#[cfg(feature = "std")] +impl std::error::Error for TinyStrError {} + +#[derive(Display, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum TinyStrError { + #[displaydoc("found string of larger length {len} when constructing string of length {max}")] + TooLarge { max: usize, len: usize }, + #[displaydoc("tinystr types do not support strings with null bytes")] + ContainsNull, + #[displaydoc("attempted to construct TinyStrAuto from a non-ascii string")] + NonAscii, +} diff --git a/third_party/rust/tinystr/src/int_ops.rs b/third_party/rust/tinystr/src/int_ops.rs new file mode 100644 index 0000000000..102b052f22 --- /dev/null +++ b/third_party/rust/tinystr/src/int_ops.rs @@ -0,0 +1,315 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::asciibyte::AsciiByte; + +/// Internal helper struct that performs operations on aligned integers. +/// Supports strings up to 4 bytes long. +#[repr(transparent)] +pub struct Aligned4(u32); + +impl Aligned4 { + /// # Panics + /// Panics if N is greater than 4 + #[inline] + pub const fn from_bytes(src: &[u8; N]) -> Self { + let mut bytes = [0; 4]; + let mut i = 0; + // The function documentation defines when panics may occur + #[allow(clippy::indexing_slicing)] + while i < N { + bytes[i] = src[i]; + i += 1; + } + Self(u32::from_ne_bytes(bytes)) + } + + #[inline] + pub const fn from_ascii_bytes(src: &[AsciiByte; N]) -> Self { + Self::from_bytes::(unsafe { core::mem::transmute(src) }) + } + + #[inline] + pub const fn to_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } + + #[inline] + pub const fn to_ascii_bytes(&self) -> [AsciiByte; 4] { + unsafe { core::mem::transmute(self.to_bytes()) } + } + + pub const fn len(&self) -> usize { + let word = self.0; + #[cfg(target_endian = "little")] + let len = (4 - word.leading_zeros() / 8) as usize; + #[cfg(target_endian = "big")] + let len = (4 - word.trailing_zeros() / 8) as usize; + len + } + + pub const fn is_ascii_alphabetic(&self) -> bool { + let word = self.0; + // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid. + // `mask` sets all NUL bytes to 0. + let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; + // `lower` converts the string to lowercase. It may also change the value of non-alpha + // characters, but this does not matter for the alphabetic test that follows. + let lower = word | 0x2020_2020; + // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters. 
+ let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505); + // The overall string is valid if every character passes at least one test. + // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`). + (alpha & mask) == 0 + } + + pub const fn is_ascii_alphanumeric(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_alphabetic + let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; + let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646); + let lower = word | 0x2020_2020; + let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505); + (alpha & numeric & mask) == 0 + } + + pub const fn is_ascii_numeric(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_alphabetic + let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; + let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646); + (numeric & mask) == 0 + } + + pub const fn is_ascii_lowercase(&self) -> bool { + let word = self.0; + // For efficiency, this function tests for an invalid string rather than a valid string. + // A string is ASCII lowercase iff it contains no uppercase ASCII characters. + // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1. + let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525); + // The string is valid if it contains no invalid characters (if all high bits are 1). 
+ (invalid_case & 0x8080_8080) == 0x8080_8080 + } + + pub const fn is_ascii_titlecase(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_lowercase + let invalid_case = if cfg!(target_endian = "little") { + !(word + 0x3f3f_3f1f) | (word + 0x2525_2505) + } else { + !(word + 0x1f3f_3f3f) | (word + 0x0525_2525) + }; + (invalid_case & 0x8080_8080) == 0x8080_8080 + } + + pub const fn is_ascii_uppercase(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_lowercase + let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505); + (invalid_case & 0x8080_8080) == 0x8080_8080 + } + + pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { + let word = self.0; + // `mask` sets all NUL bytes to 0. + let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; + // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1. + let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505); + // The overall string is valid if every character passes at least one test. + // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`). 
+ (lower_alpha & mask) == 0 + } + + pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_alphabetic_lowercase + let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; + let title_case = if cfg!(target_endian = "little") { + !(word + 0x1f1f_1f3f) | (word + 0x0505_0525) + } else { + !(word + 0x3f1f_1f1f) | (word + 0x2505_0505) + }; + (title_case & mask) == 0 + } + + pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_alphabetic_lowercase + let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; + let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525); + (upper_alpha & mask) == 0 + } + + pub const fn to_ascii_lowercase(&self) -> Self { + let word = self.0; + let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2); + Self(result) + } + + pub const fn to_ascii_titlecase(&self) -> Self { + let word = self.0.to_le(); + let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2; + let result = (word | mask) & !(0x20 & mask); + Self(u32::from_le(result)) + } + + pub const fn to_ascii_uppercase(&self) -> Self { + let word = self.0; + let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2); + Self(result) + } +} + +/// Internal helper struct that performs operations on aligned integers. +/// Supports strings up to 8 bytes long. 
+#[repr(transparent)] +pub struct Aligned8(u64); + +impl Aligned8 { + /// # Panics + /// Panics if N is greater than 8 + #[inline] + pub const fn from_bytes(src: &[u8; N]) -> Self { + let mut bytes = [0; 8]; + let mut i = 0; + // The function documentation defines when panics may occur + #[allow(clippy::indexing_slicing)] + while i < N { + bytes[i] = src[i]; + i += 1; + } + Self(u64::from_ne_bytes(bytes)) + } + + #[inline] + pub const fn from_ascii_bytes(src: &[AsciiByte; N]) -> Self { + Self::from_bytes::(unsafe { core::mem::transmute(src) }) + } + + #[inline] + pub const fn to_bytes(&self) -> [u8; 8] { + self.0.to_ne_bytes() + } + + #[inline] + pub const fn to_ascii_bytes(&self) -> [AsciiByte; 8] { + unsafe { core::mem::transmute(self.to_bytes()) } + } + + pub const fn len(&self) -> usize { + let word = self.0; + #[cfg(target_endian = "little")] + let len = (8 - word.leading_zeros() / 8) as usize; + #[cfg(target_endian = "big")] + let len = (8 - word.trailing_zeros() / 8) as usize; + len + } + + pub const fn is_ascii_alphabetic(&self) -> bool { + let word = self.0; + let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; + let lower = word | 0x2020_2020_2020_2020; + let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505); + (alpha & mask) == 0 + } + + pub const fn is_ascii_alphanumeric(&self) -> bool { + let word = self.0; + let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; + let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646); + let lower = word | 0x2020_2020_2020_2020; + let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505); + (alpha & numeric & mask) == 0 + } + + pub const fn is_ascii_numeric(&self) -> bool { + let word = self.0; + let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; + let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646); + (numeric & mask) == 0 + } + + pub const fn is_ascii_lowercase(&self) -> bool { 
+ let word = self.0; + let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525); + (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 + } + + pub const fn is_ascii_titlecase(&self) -> bool { + let word = self.0; + let invalid_case = if cfg!(target_endian = "little") { + !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505) + } else { + !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525) + }; + (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 + } + + pub const fn is_ascii_uppercase(&self) -> bool { + let word = self.0; + let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505); + (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 + } + + pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { + let word = self.0; + // `mask` sets all NUL bytes to 0. + let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; + // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1. + let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505); + // The overall string is valid if every character passes at least one test. + // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`). 
+ (lower_alpha & mask) == 0 + } + + pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_alphabetic_lowercase + let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; + let title_case = if cfg!(target_endian = "little") { + !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525) + } else { + !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505) + }; + (title_case & mask) == 0 + } + + pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { + let word = self.0; + // See explanatory comments in is_ascii_alphabetic_lowercase + let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; + let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525); + (upper_alpha & mask) == 0 + } + + pub const fn to_ascii_lowercase(&self) -> Self { + let word = self.0; + let result = word + | (((word + 0x3f3f_3f3f_3f3f_3f3f) + & !(word + 0x2525_2525_2525_2525) + & 0x8080_8080_8080_8080) + >> 2); + Self(result) + } + + pub const fn to_ascii_titlecase(&self) -> Self { + let word = self.0.to_le(); + let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f) + & !(word + 0x2525_2525_2525_2505) + & 0x8080_8080_8080_8080) + >> 2; + let result = (word | mask) & !(0x20 & mask); + Self(u64::from_le(result)) + } + + pub const fn to_ascii_uppercase(&self) -> Self { + let word = self.0; + let result = word + & !(((word + 0x1f1f_1f1f_1f1f_1f1f) + & !(word + 0x0505_0505_0505_0505) + & 0x8080_8080_8080_8080) + >> 2); + Self(result) + } +} diff --git a/third_party/rust/tinystr/src/lib.rs b/third_party/rust/tinystr/src/lib.rs new file mode 100644 index 0000000000..7745da0e54 --- /dev/null +++ b/third_party/rust/tinystr/src/lib.rs @@ -0,0 +1,116 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 
`tinystr` is a utility crate of the [`ICU4X`] project. +//! +//! It includes [`TinyAsciiStr`], a core API for representing small ASCII-only bounded length strings. +//! +//! It is optimized for operations on strings of size 8 or smaller. When use cases involve comparison +//! and conversion of strings for lowercase/uppercase/titlecase, or checking +//! numeric/alphabetic/alphanumeric, `TinyAsciiStr` offers cutting-edge performance. +//! +//! # Examples +//! +//! ```rust +//! use tinystr::TinyAsciiStr; +//! +//! let s1: TinyAsciiStr<4> = "tEsT".parse().expect("Failed to parse."); +//! +//! assert_eq!(s1, "tEsT"); +//! assert_eq!(s1.to_ascii_uppercase(), "TEST"); +//! assert_eq!(s1.to_ascii_lowercase(), "test"); +//! assert_eq!(s1.to_ascii_titlecase(), "Test"); +//! assert!(s1.is_ascii_alphanumeric()); +//! assert!(!s1.is_ascii_numeric()); +//! +//! let s2 = TinyAsciiStr::<8>::try_from_raw(*b"New York") +//! .expect("Failed to parse."); +//! +//! assert_eq!(s2, "New York"); +//! assert_eq!(s2.to_ascii_uppercase(), "NEW YORK"); +//! assert_eq!(s2.to_ascii_lowercase(), "new york"); +//! assert_eq!(s2.to_ascii_titlecase(), "New york"); +//! assert!(!s2.is_ascii_alphanumeric()); +//! ``` +//! +//! # Details +//! +//! When strings are of size 8 or smaller, the struct treats the strings as `u32`/`u64` integers and uses +//! bitmasking to provide basic string manipulation operations: +//! * `is_ascii_numeric` +//! * `is_ascii_alphabetic` +//! * `is_ascii_alphanumeric` +//! * `to_ascii_lowercase` +//! * `to_ascii_uppercase` +//! * `to_ascii_titlecase` +//! * `PartialEq` +//! +//! `TinyAsciiStr` will fall back to `u8` character manipulation for strings of length greater than 8. + +//! +//! 
[`ICU4X`]: ../icu/index.html + +// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations +#![cfg_attr(not(any(test, feature = "std")), no_std)] +#![cfg_attr( + not(test), + deny( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::exhaustive_structs, + clippy::exhaustive_enums, + missing_debug_implementations, + ) +)] + +mod macros; + +mod ascii; +mod asciibyte; +mod error; +mod int_ops; + +#[cfg(feature = "serde")] +mod serde; + +#[cfg(feature = "databake")] +mod databake; + +#[cfg(feature = "zerovec")] +mod ule; + +#[cfg(any(feature = "serde", feature = "alloc"))] +extern crate alloc; + +pub use ascii::TinyAsciiStr; +pub use error::TinyStrError; + +/// These are temporary compatibility reexports that will be removed +/// in a future version. +pub type TinyStr4 = TinyAsciiStr<4>; +/// These are temporary compatibility reexports that will be removed +/// in a future version. +pub type TinyStr8 = TinyAsciiStr<8>; +/// These are temporary compatibility reexports that will be removed +/// in a future version. +pub type TinyStr16 = TinyAsciiStr<16>; + +#[test] +fn test_size() { + assert_eq!( + core::mem::size_of::(), + core::mem::size_of::>() + ); + assert_eq!( + core::mem::size_of::(), + core::mem::size_of::>() + ); +} +// /// Allows unit tests to use the macro +// #[cfg(test)] +// mod tinystr { +// pub use super::{TinyAsciiStr, TinyStrError}; +// } diff --git a/third_party/rust/tinystr/src/macros.rs b/third_party/rust/tinystr/src/macros.rs new file mode 100644 index 0000000000..b00185238e --- /dev/null +++ b/third_party/rust/tinystr/src/macros.rs @@ -0,0 +1,32 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[macro_export] +macro_rules! 
tinystr { + ($n:literal, $s:literal) => {{ + // Force it into a const context; otherwise it may get evaluated at runtime instead. + const TINYSTR_MACRO_CONST: $crate::TinyAsciiStr<$n> = { + match $crate::TinyAsciiStr::from_bytes($s.as_bytes()) { + Ok(s) => s, + // We are okay with panicking here because this is in a const context + #[allow(clippy::panic)] + // Cannot format the error since formatting isn't const yet + Err(_) => panic!(concat!("Failed to construct tinystr from ", $s)), + } + }; + TINYSTR_MACRO_CONST + }}; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_macro_construction() { + let s1 = tinystr!(8, "foobar"); + assert_eq!(&*s1, "foobar"); + + let s1 = tinystr!(12, "foobarbaz"); + assert_eq!(&*s1, "foobarbaz"); + } +} diff --git a/third_party/rust/tinystr/src/serde.rs b/third_party/rust/tinystr/src/serde.rs new file mode 100644 index 0000000000..933491f178 --- /dev/null +++ b/third_party/rust/tinystr/src/serde.rs @@ -0,0 +1,91 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::TinyAsciiStr; +use alloc::borrow::Cow; +use alloc::string::ToString; +use core::fmt; +use core::marker::PhantomData; +use core::ops::Deref; +use serde::de::{Error, SeqAccess, Visitor}; +use serde::ser::SerializeTuple; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +impl Serialize for TinyAsciiStr { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if serializer.is_human_readable() { + self.deref().serialize(serializer) + } else { + let mut seq = serializer.serialize_tuple(N)?; + for byte in self.all_bytes() { + seq.serialize_element(byte)?; + } + seq.end() + } + } +} + +struct TinyAsciiStrVisitor { + marker: PhantomData>, +} + +impl TinyAsciiStrVisitor { + fn new() -> Self { + TinyAsciiStrVisitor { + marker: PhantomData, + } + } +} + +impl<'de, const N: usize> Visitor<'de> for TinyAsciiStrVisitor { + type Value = TinyAsciiStr; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "a TinyAsciiStr<{}>", N) + } + + #[inline] + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + let mut bytes = [0u8; N]; + let mut zeroes = false; + for out in &mut bytes.iter_mut().take(N) { + let byte = seq + .next_element()? 
+ .ok_or_else(|| Error::invalid_length(N, &self))?; + if byte == 0 { + zeroes = true; + } else if zeroes { + return Err(Error::custom("TinyAsciiStr cannot contain null bytes")); + } + + if byte >= 0x80 { + return Err(Error::custom("TinyAsciiStr cannot contain non-ascii bytes")); + } + *out = byte; + } + + Ok(unsafe { TinyAsciiStr::from_bytes_unchecked(bytes) }) + } +} + +impl<'de, const N: usize> Deserialize<'de> for TinyAsciiStr { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + if deserializer.is_human_readable() { + let x: Cow<'de, str> = Deserialize::deserialize(deserializer)?; + TinyAsciiStr::from_str(&x).map_err(|e| Error::custom(e.to_string())) + } else { + deserializer.deserialize_tuple(N, TinyAsciiStrVisitor::::new()) + } + } +} diff --git a/third_party/rust/tinystr/src/ule.rs b/third_party/rust/tinystr/src/ule.rs new file mode 100644 index 0000000000..0fa212095f --- /dev/null +++ b/third_party/rust/tinystr/src/ule.rs @@ -0,0 +1,76 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::TinyAsciiStr; +use zerovec::maps::ZeroMapKV; +use zerovec::ule::*; +use zerovec::{ZeroSlice, ZeroVec}; + +// Safety (based on the safety checklist on the ULE trait): +// 1. TinyAsciiStr does not include any uninitialized or padding bytes. +// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) +// 2. TinyAsciiStr is aligned to 1 byte. +// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) +// 3. The impl of validate_byte_slice() returns an error if any byte is not valid. +// 4. The impl of validate_byte_slice() returns an error if there are extra bytes. +// 5. The other ULE methods use the default impl. +// 6. 
TinyAsciiStr byte equality is semantic equality +unsafe impl ULE for TinyAsciiStr { + #[inline] + fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> { + if bytes.len() % N != 0 { + return Err(ZeroVecError::length::(bytes.len())); + } + // Validate the bytes + for chunk in bytes.chunks_exact(N) { + let _ = TinyAsciiStr::::from_bytes_inner(chunk, 0, N, true) + .map_err(|_| ZeroVecError::parse::())?; + } + Ok(()) + } +} + +impl AsULE for TinyAsciiStr { + type ULE = Self; + + #[inline] + fn to_unaligned(self) -> Self::ULE { + self + } + + #[inline] + fn from_unaligned(unaligned: Self::ULE) -> Self { + unaligned + } +} + +impl<'a, const N: usize> ZeroMapKV<'a> for TinyAsciiStr { + type Container = ZeroVec<'a, TinyAsciiStr>; + type Slice = ZeroSlice>; + type GetType = TinyAsciiStr; + type OwnedType = TinyAsciiStr; +} + +#[cfg(test)] +mod test { + use crate::*; + use zerovec::*; + + #[test] + fn test_zerovec() { + let mut vec = ZeroVec::>::new(); + + vec.with_mut(|v| v.push("foobar".parse().unwrap())); + vec.with_mut(|v| v.push("baz".parse().unwrap())); + vec.with_mut(|v| v.push("quux".parse().unwrap())); + + let bytes = vec.as_bytes(); + + let vec: ZeroVec> = ZeroVec::parse_byte_slice(bytes).unwrap(); + + assert_eq!(&*vec.get(0).unwrap(), "foobar"); + assert_eq!(&*vec.get(1).unwrap(), "baz"); + assert_eq!(&*vec.get(2).unwrap(), "quux"); + } +} diff --git a/third_party/rust/tinystr/tests/serde.rs b/third_party/rust/tinystr/tests/serde.rs new file mode 100644 index 0000000000..282914e6fc --- /dev/null +++ b/third_party/rust/tinystr/tests/serde.rs @@ -0,0 +1,39 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use tinystr::*; + +// Tests largely adapted from `tinystr` crate +// https://github.com/zbraniecki/tinystr/blob/4e4eab55dd6bded7f29a18b41452c506c461716c/tests/serde.rs + +macro_rules! 
test_roundtrip { + ($f:ident, $n:literal, $val:expr) => { + #[test] + fn $f() { + let tiny: TinyAsciiStr<$n> = $val.parse().unwrap(); + let json_string = serde_json::to_string(&tiny).unwrap(); + let expected_json = concat!("\"", $val, "\""); + assert_eq!(json_string, expected_json); + let recover: TinyAsciiStr<$n> = serde_json::from_str(&json_string).unwrap(); + assert_eq!(&*tiny, &*recover); + + let bin = bincode::serialize(&tiny).unwrap(); + assert_eq!(bin, &tiny.all_bytes()[..]); + let debin: TinyAsciiStr<$n> = bincode::deserialize(&bin).unwrap(); + assert_eq!(&*tiny, &*debin); + + let post = postcard::to_stdvec(&tiny).unwrap(); + assert_eq!(post, &tiny.all_bytes()[..]); + let unpost: TinyAsciiStr<$n> = postcard::from_bytes(&post).unwrap(); + assert_eq!(&*tiny, &*unpost); + } + }; +} + +test_roundtrip!(test_roundtrip4_1, 4, "en"); +test_roundtrip!(test_roundtrip4_2, 4, "Latn"); +test_roundtrip!(test_roundtrip8, 8, "calendar"); +test_roundtrip!(test_roundtrip16, 16, "verylongstring"); +test_roundtrip!(test_roundtrip10, 11, "shortstring"); +test_roundtrip!(test_roundtrip30, 24, "veryveryverylongstring"); -- cgit v1.2.3