// Provenance (recovered from a flattened cgit patch view, cgit v1.2.3):
//
//   From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001
//   From: Daniel Baumann
//   Date: Fri, 19 Apr 2024 02:47:55 +0200
//   Subject: Adding upstream version 124.0.1.
//   Signed-off-by: Daniel Baumann
//
//   third_party/rust/regex-automata/src/util/utf8.rs | 196 +++++++++++++++++
//   1 file changed, 196 insertions(+)
//   create mode 100644 third_party/rust/regex-automata/src/util/utf8.rs
//   (new file, index 0000000000..91b27efe0f)
//
// The extraction had collapsed the patch onto a few lines and dropped the
// generic arguments of the return types; both were restored here.

/*!
Utilities for dealing with UTF-8.

This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/

/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    // Builds the 256-entry lookup table at compile time. `while` loops are
    // used because `for` is not available in `const fn`.
    const fn mkwordset() -> [bool; 256] {
        // FIXME: Use as_usize() once const functions in traits are stable.
        let mut set = [false; 256];
        set[b'_' as usize] = true;

        let mut byte = b'0';
        while byte <= b'9' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'A';
        while byte <= b'Z' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'a';
        while byte <= b'z' {
            set[byte as usize] = true;
            byte += 1;
        }
        set
    }
    const WORD: [bool; 256] = mkwordset();
    // A `u8` always indexes within the 256-entry table, so this cannot panic.
    WORD[b as usize]
}

/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for a
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let len = match len(bytes[0]) {
        // Not a leading byte at all.
        None => return Some(Err(bytes[0])),
        // The sequence is truncated by the end of the slice.
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
        // ASCII fast path.
        Some(1) => return Some(Ok(char::from(bytes[0]))),
        Some(len) => len,
    };
    // Let the standard library do the real validation (overlong encodings,
    // surrogates, out-of-range values and bad continuation bytes).
    match core::str::from_utf8(&bytes[..len]) {
        // A successfully validated, non-empty prefix has at least one char.
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(bytes[0])),
    }
}

/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    // Walk backwards over continuation bytes, but never further than the
    // last 4 bytes of the slice, since no encoding is longer than 4 bytes.
    let mut start = bytes.len() - 1;
    let limit = bytes.len().saturating_sub(4);
    while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
        start -= 1;
    }
    match decode(&bytes[start..]) {
        None => None,
        Some(Ok(ch)) => Some(Ok(ch)),
        // On failure, report the *last* byte, not the byte decoding began at.
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
    }
}

/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    if byte <= 0x7F {
        Some(1)
    } else if byte & 0b1100_0000 == 0b1000_0000 {
        // Continuation byte: never starts a sequence.
        None
    } else if byte <= 0b1101_1111 {
        Some(2)
    } else if byte <= 0b1110_1111 {
        Some(3)
    } else if byte <= 0b1111_0111 {
        Some(4)
    } else {
        None
    }
}

/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // The position at the end of the bytes always represents an empty
        // string, which is a valid boundary. But anything after that doesn't
        // make much sense to call a valid boundary.
        None => i == bytes.len(),
        // Other than ASCII (where the most significant bit is never set),
        // valid starting bytes always have their most significant two bits
        // set, whereas continuation bytes never have their second most
        // significant bit set. Therefore, this only returns true when bytes[i]
        // corresponds to a byte that begins a valid UTF-8 encoding of a
        // Unicode scalar value.
        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
    }
}

/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}

/*
/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`. (If there is no value greater
/// than `i` that fits in `usize`, then this panics.)
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
///
/// NOTE: This method was used in a previous conception of iterators where we
/// specifically tried to skip over empty matches that split a codepoint by
/// simply requiring that our next search begin at the beginning of codepoint.
/// But we ended up changing that technique to always advance by 1 byte and
/// then filter out matches that split a codepoint after-the-fact. Thus, we no
/// longer use this method. But I've kept it around in case we want to switch
/// back to this approach. Its guarantees are a little subtle, so I'd prefer
/// not to rebuild it from whole cloth.
pub(crate) fn next(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i.checked_add(1).unwrap(),
        Some(&b) => b,
    };
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
    // other than just start at the next byte.
    let inc = len(b).unwrap_or(1);
    i.checked_add(inc).unwrap()
}
*/