From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- library/core/src/str/count.rs | 136 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 library/core/src/str/count.rs (limited to 'library/core/src/str/count.rs') diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs new file mode 100644 index 000000000..28567a7e7 --- /dev/null +++ b/library/core/src/str/count.rs @@ -0,0 +1,136 @@ +//! Code for efficiently counting the number of `char`s in a UTF-8 encoded +//! string. +//! +//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`, +//! followed by some number (possibly 0) of continuation bytes. +//! +//! The leading byte can have a number of bit-patterns (with the specific +//! pattern indicating how many continuation bytes follow), but the continuation +//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any +//! value). That is, the most significant bit is set, and the second most +//! significant bit is unset. +//! +//! To count the number of characters, we can just count the number of bytes in +//! the string which are not continuation bytes, which can be done many bytes at +//! a time fairly easily. +//! +//! Note: Because the term "leading byte" can sometimes be ambiguous (for +//! example, it could also refer to the first byte of a slice), we'll often use +//! the term "non-continuation byte" to refer to these bytes in the code. +use core::intrinsics::unlikely; + +const USIZE_SIZE: usize = core::mem::size_of::(); +const UNROLL_INNER: usize = 4; + +#[inline] +pub(super) fn count_chars(s: &str) -> usize { + if s.len() < USIZE_SIZE * UNROLL_INNER { + // Avoid entering the optimized implementation for strings where the + // difference is not likely to matter, or where it might even be slower. + // That said, a ton of thought was not spent on the particular threshold + // here, beyond "this value seems to make sense". + char_count_general_case(s.as_bytes()) + } else { + do_count_chars(s) + } +} + +fn do_count_chars(s: &str) -> usize { + // For correctness, `CHUNK_SIZE` must be: + // + // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. + // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the + // `body.chunks(CHUNK_SIZE)` loop is incorrect. + // + // For performance, `CHUNK_SIZE` should be: + // - Relatively cheap to `/` against (so some simple sum of powers of two). + // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize` + // too often. + const CHUNK_SIZE: usize = 192; + + // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required + // for correctness. + const _: () = assert!(CHUNK_SIZE < 256); + const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0); + + // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size + // differences which are handled by `align_to`. + let (head, body, tail) = unsafe { s.as_bytes().align_to::() }; + + // This should be quite rare, and basically exists to handle the degenerate + // cases where align_to fails (as well as miri under symbolic alignment + // mode). + // + // The `unlikely` helps discourage LLVM from inlining the body, which is + // nice, as we would rather not mark the `char_count_general_case` function + // as cold. + if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) { + return char_count_general_case(s.as_bytes()); + } + + let mut total = char_count_general_case(head) + char_count_general_case(tail); + // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which + // we call `sum_bytes_in_usize`. + for chunk in body.chunks(CHUNK_SIZE) { + // We accumulate intermediate sums in `counts`, where each byte contains + // a subset of the sum of this chunk, like a `[u8; size_of::()]`. + let mut counts = 0; + + let (unrolled_chunks, remainder) = chunk.as_chunks::(); + for unrolled in unrolled_chunks { + for &word in unrolled { + // Because `CHUNK_SIZE` is < 256, this addition can't cause the + // count in any of the bytes to overflow into a subsequent byte. + counts += contains_non_continuation_byte(word); + } + } + + // Sum the values in `counts` (which, again, is conceptually a `[u8; + // size_of::()]`), and accumulate the result into `total`. + total += sum_bytes_in_usize(counts); + + // If there's any data in `remainder`, then handle it. This will only + // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE` + // is divisible by `UNROLL_INNER`), so we explicitly break at the end + // (which seems to help LLVM out). + if !remainder.is_empty() { + // Accumulate all the data in the remainder. + let mut counts = 0; + for &word in remainder { + counts += contains_non_continuation_byte(word); + } + total += sum_bytes_in_usize(counts); + break; + } + } + total +} + +// Checks each byte of `w` to see if it contains the first byte in a UTF-8 +// sequence. Bytes in `w` which are continuation bytes are left as `0x00` (e.g. +// false), and bytes which are non-continuation bytes are left as `0x01` (e.g. +// true) +#[inline] +fn contains_non_continuation_byte(w: usize) -> usize { + const LSB: usize = usize::repeat_u8(0x01); + ((!w >> 7) | (w >> 6)) & LSB +} + +// Morally equivalent to `values.to_ne_bytes().into_iter().sum::()`, but +// more efficient. +#[inline] +fn sum_bytes_in_usize(values: usize) -> usize { + const LSB_SHORTS: usize = usize::repeat_u16(0x0001); + const SKIP_BYTES: usize = usize::repeat_u16(0x00ff); + + let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES); + pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8) +} + +// This is the most direct implementation of the concept of "count the number of +// bytes in the string which are not continuation bytes", and is used for the +// head and tail of the input string (the first and last item in the tuple +// returned by `slice::align_to`). +fn char_count_general_case(s: &[u8]) -> usize { + s.iter().filter(|&&byte| !super::validations::utf8_is_cont_byte(byte)).count() +} -- cgit v1.2.3