use core::{char, cmp, fmt, str}; #[cfg(feature = "std")] use std::error; use crate::{ascii, bstr::BStr, ext_slice::ByteSlice}; // The UTF-8 decoder provided here is based on the one presented here: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ // // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}` // using regex-automata that is roughly the same size. The real benefit of // Hoehrmann's formulation is that the byte class mapping below is manually // tailored such that each byte's class doubles as a shift to mask out the // bits necessary for constructing the leading bits of each codepoint value // from the initial byte. // // There are some minor differences between this implementation and Hoehrmann's // formulation. // // Firstly, we make REJECT have state ID 0, since it makes the state table // itself a little easier to read and is consistent with the notion that 0 // means "false" or "bad." // // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast // path. // // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction // in the core decoding loop. (Which is what regex-automata would do by // default.) // // Fourthly, we split the byte class mapping and transition table into two // arrays because it's clearer. // // It is unlikely that this is the fastest way to do UTF-8 decoding, however, // it is fairly simple. const ACCEPT: usize = 12; const REJECT: usize = 0; /// SAFETY: The decode below function relies on the correctness of these /// equivalence classes. #[cfg_attr(rustfmt, rustfmt::skip)] const CLASSES: [u8; 256] = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, ]; /// SAFETY: The decode below function relies on the correctness of this state /// machine. #[cfg_attr(rustfmt, rustfmt::skip)] const STATES_FORWARD: &'static [u8] = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; /// An iterator over Unicode scalar values in a byte string. /// /// When invalid UTF-8 byte sequences are found, they are substituted with the /// Unicode replacement codepoint (`U+FFFD`) using the /// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). /// /// This iterator is created by the /// [`chars`](trait.ByteSlice.html#method.chars) method provided by the /// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. #[derive(Clone, Debug)] pub struct Chars<'a> { bs: &'a [u8], } impl<'a> Chars<'a> { pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> { Chars { bs } } /// View the underlying data as a subslice of the original data. /// /// The slice returned has the same lifetime as the original slice, and so /// the iterator can continue to be used while this exists. 
/// /// # Examples /// /// ``` /// use bstr::ByteSlice; /// /// let mut chars = b"abc".chars(); /// /// assert_eq!(b"abc", chars.as_bytes()); /// chars.next(); /// assert_eq!(b"bc", chars.as_bytes()); /// chars.next(); /// chars.next(); /// assert_eq!(b"", chars.as_bytes()); /// ``` #[inline] pub fn as_bytes(&self) -> &'a [u8] { self.bs } } impl<'a> Iterator for Chars<'a> { type Item = char; #[inline] fn next(&mut self) -> Option<char> { let (ch, size) = decode_lossy(self.bs); if size == 0 { return None; } self.bs = &self.bs[size..]; Some(ch) } } impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option<char> { let (ch, size) = decode_last_lossy(self.bs); if size == 0 { return None; } self.bs = &self.bs[..self.bs.len() - size]; Some(ch) } } /// An iterator over Unicode scalar values in a byte string and their /// byte index positions. /// /// When invalid UTF-8 byte sequences are found, they are substituted with the /// Unicode replacement codepoint (`U+FFFD`) using the /// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). /// /// Note that this is slightly different from the `CharIndices` iterator /// provided by the standard library. Aside from working on possibly invalid /// UTF-8, this iterator provides both the corresponding starting and ending /// byte indices of each codepoint yielded. The ending position is necessary to /// slice the original byte string when invalid UTF-8 bytes are converted into /// a Unicode replacement codepoint, since a single replacement codepoint can /// substitute anywhere from 1 to 3 invalid bytes (inclusive). /// /// This iterator is created by the /// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided /// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. #[derive(Clone, Debug)] pub struct CharIndices<'a> { bs: &'a [u8], forward_index: usize, reverse_index: usize, } impl<'a> CharIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { CharIndices { bs, forward_index: 0, reverse_index: bs.len() } } /// View the underlying data as a subslice of the original data. /// /// The slice returned has the same lifetime as the original slice, and so /// the iterator can continue to be used while this exists. /// /// # Examples /// /// ``` /// use bstr::ByteSlice; /// /// let mut it = b"abc".char_indices(); /// /// assert_eq!(b"abc", it.as_bytes()); /// it.next(); /// assert_eq!(b"bc", it.as_bytes()); /// it.next(); /// it.next(); /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] pub fn as_bytes(&self) -> &'a [u8] { self.bs } } impl<'a> Iterator for CharIndices<'a> { type Item = (usize, usize, char); #[inline] fn next(&mut self) -> Option<(usize, usize, char)> { let index = self.forward_index; let (ch, size) = decode_lossy(self.bs); if size == 0 { return None; } self.bs = &self.bs[size..]; self.forward_index += size; Some((index, index + size, ch)) } } impl<'a> DoubleEndedIterator for CharIndices<'a> { #[inline] fn next_back(&mut self) -> Option<(usize, usize, char)> { let (ch, size) = decode_last_lossy(self.bs); if size == 0 { return None; } self.bs = &self.bs[..self.bs.len() - size]; self.reverse_index -= size; Some((self.reverse_index, self.reverse_index + size, ch)) } } impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {} /// An iterator over chunks of valid UTF-8 in a byte slice. /// /// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
#[derive(Clone, Debug)] pub struct Utf8Chunks<'a> { pub(super) bytes: &'a [u8], } /// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes. /// /// This is yielded by the /// [`Utf8Chunks`](struct.Utf8Chunks.html) /// iterator, which can be created via the /// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks) /// method. /// /// The `'a` lifetime parameter corresponds to the lifetime of the bytes that /// are being iterated over. #[cfg_attr(test, derive(Debug, PartialEq))] pub struct Utf8Chunk<'a> { /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes. /// /// This is empty between adjacent invalid UTF-8 byte sequences. valid: &'a str, /// A sequence of invalid UTF-8 bytes. /// /// Can only be empty in the last chunk. /// /// Should be replaced by a single Unicode replacement character, if not /// empty. invalid: &'a BStr, /// Indicates whether the invalid sequence could've been valid if there /// were more bytes. /// /// Can only be true in the last chunk. incomplete: bool, } impl<'a> Utf8Chunk<'a> { /// Returns the (possibly empty) valid UTF-8 bytes in this chunk. /// /// This may be empty if there are consecutive sequences of invalid UTF-8 /// bytes. #[inline] pub fn valid(&self) -> &'a str { self.valid } /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that /// immediately follow the valid UTF-8 bytes in this chunk. /// /// This is only empty when this chunk corresponds to the last chunk in /// the original bytes. /// /// The maximum length of this slice is 3. That is, invalid UTF-8 byte /// sequences longer than 1 byte always correspond to a valid _prefix_ of /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution /// of maximal subparts" strategy that is described in more detail in the /// docs for the /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) /// method. #[inline] pub fn invalid(&self) -> &'a [u8] { self.invalid.as_bytes() } /// Returns whether the invalid sequence might still become valid if more /// bytes are added. /// /// Returns true if the end of the input was reached unexpectedly, /// without encountering an unexpected byte. /// /// This can only be the case for the last chunk. #[inline] pub fn incomplete(&self) -> bool { self.incomplete } } impl<'a> Iterator for Utf8Chunks<'a> { type Item = Utf8Chunk<'a>; #[inline] fn next(&mut self) -> Option<Utf8Chunk<'a>> { if self.bytes.is_empty() { return None; } match validate(self.bytes) { Ok(()) => { let valid = self.bytes; self.bytes = &[]; Some(Utf8Chunk { // SAFETY: This is safe because of the guarantees provided // by utf8::validate. valid: unsafe { str::from_utf8_unchecked(valid) }, invalid: [].as_bstr(), incomplete: false, }) } Err(e) => { let (valid, rest) = self.bytes.split_at(e.valid_up_to()); // SAFETY: This is safe because of the guarantees provided by // utf8::validate. let valid = unsafe { str::from_utf8_unchecked(valid) }; let (invalid_len, incomplete) = match e.error_len() { Some(n) => (n, false), None => (rest.len(), true), }; let (invalid, rest) = rest.split_at(invalid_len); self.bytes = rest; Some(Utf8Chunk { valid, invalid: invalid.as_bstr(), incomplete, }) } } } #[inline] fn size_hint(&self) -> (usize, Option<usize>) { if self.bytes.is_empty() { (0, Some(0)) } else { (1, Some(self.bytes.len())) } } } impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {} /// An error that occurs when UTF-8 decoding fails.
/// /// This error occurs when attempting to convert a non-UTF-8 byte /// string to a Rust string that must be valid UTF-8. For example, /// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method. /// /// # Example /// /// This example shows what happens when a given byte sequence is invalid, /// but ends with a sequence that is a possible prefix of valid UTF-8. /// /// ``` /// use bstr::{B, ByteSlice}; /// /// let s = B(b"foobar\xF1\x80\x80"); /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// assert_eq!(err.error_len(), None); /// ``` /// /// This example shows what happens when a given byte sequence contains /// invalid UTF-8. /// /// ``` /// use bstr::ByteSlice; /// /// let s = b"foobar\xF1\x80\x80quux"; /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// // The error length reports the maximum number of bytes that correspond to /// // a valid prefix of a UTF-8 encoded codepoint. /// assert_eq!(err.error_len(), Some(3)); /// /// // In contrast to the above which contains a single invalid prefix, /// // consider the case of multiple individual bytes that are never valid /// // prefixes. Note how the value of error_len changes! /// let s = b"foobar\xFF\xFFquux"; /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// assert_eq!(err.error_len(), Some(1)); /// /// // The fact that it's an invalid prefix does not change error_len even /// // when it immediately precedes the end of the string. /// let s = b"foobar\xFF"; /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// assert_eq!(err.error_len(), Some(1)); /// ``` #[derive(Clone, Debug, Eq, PartialEq)] pub struct Utf8Error { valid_up_to: usize, error_len: Option<usize>, } impl Utf8Error { /// Returns the byte index of the position immediately following the last /// valid UTF-8 byte. /// /// # Example /// /// This example shows how `valid_up_to` can be used to retrieve a /// possibly empty prefix that is guaranteed to be valid UTF-8: /// /// ``` /// use bstr::ByteSlice; /// /// let s = b"foobar\xF1\x80\x80quux"; /// let err = s.to_str().unwrap_err(); /// /// // This is guaranteed to never panic. /// let string = s[..err.valid_up_to()].to_str().unwrap(); /// assert_eq!(string, "foobar"); /// ``` #[inline] pub fn valid_up_to(&self) -> usize { self.valid_up_to } /// Returns the total number of invalid UTF-8 bytes immediately following /// the position returned by `valid_up_to`. This value is always at least /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8 /// encoded codepoint. /// /// If the end of the original input was found before a valid UTF-8 encoded /// codepoint could be completed, then this returns `None`. This is useful /// when processing streams, where a `None` value signals that more input /// might be needed. #[inline] pub fn error_len(&self) -> Option<usize> { self.error_len } } #[cfg(feature = "std")] impl error::Error for Utf8Error { fn description(&self) -> &str { "invalid UTF-8" } } impl fmt::Display for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to) } } /// Returns OK if and only if the given slice is completely valid UTF-8. /// /// If the slice isn't valid UTF-8, then an error is returned that explains /// the first location at which invalid UTF-8 was detected. pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { // The fast path for validating UTF-8.
It steps through a UTF-8 automaton // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is // detected, it backs up and runs the slower version of the UTF-8 automaton // to determine correct error information. fn fast(slice: &[u8]) -> Result<(), Utf8Error> { let mut state = ACCEPT; let mut i = 0; while i < slice.len() { let b = slice[i]; // ASCII fast path. If we see two consecutive ASCII bytes, then try // to validate as much ASCII as possible very quickly. if state == ACCEPT && b <= 0x7F && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) { i += ascii::first_non_ascii_byte(&slice[i..]); continue; } state = step(state, b); if state == REJECT { return Err(find_valid_up_to(slice, i)); } i += 1; } if state != ACCEPT { Err(find_valid_up_to(slice, slice.len())) } else { Ok(()) } } // Given the first position at which a UTF-8 sequence was determined to be // invalid, return an error that correctly reports the position at which // the last complete UTF-8 sequence ends. #[inline(never)] fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { // In order to find the last valid byte, we need to back up an amount // that guarantees every preceding byte is part of a valid UTF-8 // code unit sequence. To do this, we simply locate the last leading // byte that occurs before rejected_at. let mut backup = rejected_at.saturating_sub(1); while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { backup -= 1; } let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); let mut err = slow(&slice[backup..upto]).unwrap_err(); err.valid_up_to += backup; err } // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error // when an invalid sequence is found. This is split out from validate so // that the fast path doesn't need to keep track of the position of the // last valid UTF-8 byte. In particular, tracking this requires checking // for an ACCEPT state on each byte, which degrades throughput pretty // badly. fn slow(slice: &[u8]) -> Result<(), Utf8Error> { let mut state = ACCEPT; let mut valid_up_to = 0; for (i, &b) in slice.iter().enumerate() { state = step(state, b); if state == ACCEPT { valid_up_to = i + 1; } else if state == REJECT { // Our error length must always be at least 1. let error_len = Some(cmp::max(1, i - valid_up_to)); return Err(Utf8Error { valid_up_to, error_len }); } } if state != ACCEPT { Err(Utf8Error { valid_up_to, error_len: None }) } else { Ok(()) } } // Advance to the next state given the current state and current byte. fn step(state: usize, b: u8) -> usize { let class = CLASSES[b as usize]; // SAFETY: This is safe because 'class' is always <=11 and 'state' is // always <=96. Therefore, the maximal index is 96+11 = 107, where // STATES_FORWARD.len() = 108 such that every index is guaranteed to be // valid by construction of the state machine and the byte equivalence // classes. unsafe { *STATES_FORWARD.get_unchecked(state + class as usize) as usize } } fast(slice) } /// UTF-8 decode a single Unicode scalar value from the beginning of a slice. /// /// When successful, the corresponding Unicode scalar value is returned along /// with the number of bytes it was encoded with. The number of bytes consumed /// for a successful decode is always between 1 and 4, inclusive. /// /// When unsuccessful, `None` is returned along with the number of bytes that /// make up a maximal prefix of a valid UTF-8 code unit sequence. 
In this case, /// the number of bytes consumed is always between 0 and 3, inclusive, where /// 0 is only returned when `slice` is empty. /// /// # Examples /// /// Basic usage: /// /// ``` /// use bstr::decode_utf8; /// /// // Decoding a valid codepoint. /// let (ch, size) = decode_utf8(b"\xE2\x98\x83"); /// assert_eq!(Some('☃'), ch); /// assert_eq!(3, size); /// /// // Decoding an incomplete codepoint. /// let (ch, size) = decode_utf8(b"\xE2\x98"); /// assert_eq!(None, ch); /// assert_eq!(2, size); /// ``` /// /// This example shows how to iterate over all codepoints in UTF-8 encoded /// bytes, while replacing invalid UTF-8 sequences with the replacement /// codepoint: /// /// ``` /// use bstr::{B, decode_utf8}; /// /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_utf8(bytes); /// bytes = &bytes[size..]; /// chars.push(ch.unwrap_or('\u{FFFD}')); /// } /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); /// ``` #[inline] pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { let slice = slice.as_ref(); match slice.get(0) { None => return (None, 0), Some(&b) if b <= 0x7F => return (Some(b as char), 1), _ => {} } let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); while i < slice.len() { decode_step(&mut state, &mut cp, slice[i]); i += 1; if state == ACCEPT { // SAFETY: This is safe because `decode_step` guarantees that // `cp` is a valid Unicode scalar value in an ACCEPT state. let ch = unsafe { char::from_u32_unchecked(cp) }; return (Some(ch), i); } else if state == REJECT { // At this point, we always want to advance at least one byte. return (None, cmp::max(1, i.saturating_sub(1))); } } (None, i) } /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a /// slice. /// /// When successful, the corresponding Unicode scalar value is returned along /// with the number of bytes it was encoded with. The number of bytes consumed /// for a successful decode is always between 1 and 4, inclusive. /// /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned /// along with the number of bytes that make up a maximal prefix of a valid /// UTF-8 code unit sequence. In this case, the number of bytes consumed is /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is /// empty. /// /// # Examples /// /// Basic usage: /// /// ```ignore /// use bstr::decode_utf8_lossy; /// /// // Decoding a valid codepoint. /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83"); /// assert_eq!('☃', ch); /// assert_eq!(3, size); /// /// // Decoding an incomplete codepoint. /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98"); /// assert_eq!('\u{FFFD}', ch); /// assert_eq!(2, size); /// ``` /// /// This example shows how to iterate over all codepoints in UTF-8 encoded /// bytes, while replacing invalid UTF-8 sequences with the replacement /// codepoint: /// /// ```ignore /// use bstr::{B, decode_utf8_lossy}; /// /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_utf8_lossy(bytes); /// bytes = &bytes[size..]; /// chars.push(ch); /// } /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); /// ``` #[inline] pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { match decode(slice) { (Some(ch), size) => (ch, size), (None, size) => ('\u{FFFD}', size), } } /// UTF-8 decode a single Unicode scalar value from the end of a slice.
/// /// When successful, the corresponding Unicode scalar value is returned along /// with the number of bytes it was encoded with. The number of bytes consumed /// for a successful decode is always between 1 and 4, inclusive. /// /// When unsuccessful, `None` is returned along with the number of bytes that /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, /// the number of bytes consumed is always between 0 and 3, inclusive, where /// 0 is only returned when `slice` is empty. /// /// # Examples /// /// Basic usage: /// /// ``` /// use bstr::decode_last_utf8; /// /// // Decoding a valid codepoint. /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83"); /// assert_eq!(Some('☃'), ch); /// assert_eq!(3, size); /// /// // Decoding an incomplete codepoint. /// let (ch, size) = decode_last_utf8(b"\xE2\x98"); /// assert_eq!(None, ch); /// assert_eq!(2, size); /// ``` /// /// This example shows how to iterate over all codepoints in UTF-8 encoded /// bytes in reverse, while replacing invalid UTF-8 sequences with the /// replacement codepoint: /// /// ``` /// use bstr::{B, decode_last_utf8}; /// /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_last_utf8(bytes); /// bytes = &bytes[..bytes.len()-size]; /// chars.push(ch.unwrap_or('\u{FFFD}')); /// } /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); /// ``` #[inline] pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { // TODO: We could implement this by reversing the UTF-8 automaton, but for // now, we do it the slow way by using the forward automaton. let slice = slice.as_ref(); if slice.is_empty() { return (None, 0); } let mut start = slice.len() - 1; let limit = slice.len().saturating_sub(4); while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) { start -= 1; } let (ch, size) = decode(&slice[start..]); // If we didn't consume all of the bytes, then that means there's at least // one stray byte that never occurs in a valid code unit prefix, so we can // advance by one byte. if start + size != slice.len() { (None, 1) } else { (ch, size) } } /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice. /// /// When successful, the corresponding Unicode scalar value is returned along /// with the number of bytes it was encoded with. The number of bytes consumed /// for a successful decode is always between 1 and 4, inclusive. /// /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned /// along with the number of bytes that make up a maximal prefix of a valid /// UTF-8 code unit sequence. In this case, the number of bytes consumed is /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is /// empty. /// /// # Examples /// /// Basic usage: /// /// ```ignore /// use bstr::decode_last_utf8_lossy; /// /// // Decoding a valid codepoint. /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83"); /// assert_eq!('☃', ch); /// assert_eq!(3, size); /// /// // Decoding an incomplete codepoint.
/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98"); /// assert_eq!('\u{FFFD}', ch); /// assert_eq!(2, size); /// ``` /// /// This example shows how to iterate over all codepoints in UTF-8 encoded /// bytes in reverse, while replacing invalid UTF-8 sequences with the /// replacement codepoint: /// /// ```ignore /// use bstr::{B, decode_last_utf8_lossy}; /// /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_last_utf8_lossy(bytes); /// bytes = &bytes[..bytes.len()-size]; /// chars.push(ch); /// } /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); /// ``` #[inline] pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { match decode_last(slice) { (Some(ch), size) => (ch, size), (None, size) => ('\u{FFFD}', size), } } /// SAFETY: The decode function relies on state being equal to ACCEPT only if /// cp is a valid Unicode scalar value. #[inline] pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { let class = CLASSES[b as usize]; if *state == ACCEPT { *cp = (0xFF >> class) & (b as u32); } else { *cp = (b as u32 & 0b111111) | (*cp << 6); } *state = STATES_FORWARD[*state + class as usize] as usize; } /// Returns true if and only if the given byte is either a valid leading UTF-8 /// byte, or is otherwise an invalid byte that can never appear anywhere in a /// valid UTF-8 sequence. fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { // In the ASCII case, the most significant bit is never set. The leading // byte of a 2/3/4-byte sequence always has the top two most significant // bits set. For bytes that can never appear anywhere in valid UTF-8, this // also returns true, since every such byte has its two most significant // bits set: // // \xC0 :: 11000000 // \xC1 :: 11000001 // \xF5 :: 11110101 // \xF6 :: 11110110 // \xF7 :: 11110111 // \xF8 :: 11111000 // \xF9 :: 11111001 // \xFA :: 11111010 // \xFB :: 11111011 // \xFC :: 11111100 // \xFD :: 11111101 // \xFE :: 11111110 // \xFF :: 11111111 (b & 0b1100_0000) != 0b1000_0000 } #[cfg(all(test, feature = "std"))] mod tests { use std::char; use crate::{ ext_slice::{ByteSlice, B}, tests::LOSSY_TESTS, utf8::{self, Utf8Error}, }; fn utf8e(valid_up_to: usize) -> Utf8Error { Utf8Error { valid_up_to, error_len: None } } fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error { Utf8Error { valid_up_to, error_len: Some(error_len) } } #[test] #[cfg(not(miri))] fn validate_all_codepoints() { for i in 0..(0x10FFFF + 1) { let cp = match char::from_u32(i) { None => continue, Some(cp) => cp, }; let mut buf = [0; 4]; let s = cp.encode_utf8(&mut buf); assert_eq!(Ok(()), utf8::validate(s.as_bytes())); } } #[test] fn validate_multiple_codepoints() { assert_eq!(Ok(()), utf8::validate(b"abc")); assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a")); assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a")); assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",)); assert_eq!( Ok(()), utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",) ); assert_eq!( Ok(()), utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",) ); } #[test] fn validate_errors() { // single invalid byte assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF")); // single invalid byte after ASCII assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF")); // single invalid byte after 2 byte sequence assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF")); // single invalid byte after 3 byte sequence assert_eq!(Err(utf8e2(3, 1)),
utf8::validate(b"\xE2\x98\x83\xFF")); // single invalid byte after 4 byte sequence assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF")); // An invalid 2-byte sequence with a valid 1-byte prefix. assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0")); // An invalid 3-byte sequence with a valid 2-byte prefix. assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0")); // An invalid 4-byte sequence with a valid 3-byte prefix. assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0")); // An overlong sequence. Should be \xE2\x82\xAC, but we encode the // same codepoint value in 4 bytes. This not only tests that we reject // overlong sequences, but that we get valid_up_to correct. assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC")); assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC")); assert_eq!( Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",) ); // Check that encoding a surrogate codepoint using the UTF-8 scheme // fails validation. assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80")); assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80")); assert_eq!( Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",) ); // Check that an incomplete 2-byte sequence fails. assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa")); assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa")); assert_eq!( Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",) ); // Check that an incomplete 3-byte sequence fails. assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a")); assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a")); assert_eq!( Err(utf8e2(3, 2)), utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",) ); // Check that an incomplete 4-byte sequence fails. assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca")); assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca")); assert_eq!( Err(utf8e2(4, 3)), utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",) ); assert_eq!( Err(utf8e2(6, 3)), utf8::validate(b"foobar\xF1\x80\x80quux",) ); // Check that an incomplete (EOF) 2-byte sequence fails. assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE")); assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE")); assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE")); // Check that an incomplete (EOF) 3-byte sequence fails. assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98")); assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98")); assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98")); // Check that an incomplete (EOF) 4-byte sequence fails. assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C")); assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C")); assert_eq!( Err(utf8e(4)), utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",) ); // Test that we errors correct even after long valid sequences. This // checks that our "backup" logic for detecting errors is correct. 
assert_eq!( Err(utf8e2(8, 1)), utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",) ); } #[test] fn decode_valid() { fn d(mut s: &str) -> Vec<char> { let mut chars = vec![]; while !s.is_empty() { let (ch, size) = utf8::decode(s.as_bytes()); s = &s[size..]; chars.push(ch.unwrap()); } chars } assert_eq!(vec!['☃'], d("☃")); assert_eq!(vec!['☃', '☃'], d("☃☃")); assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε")); assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇")); assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲")); } #[test] fn decode_invalid() { let (ch, size) = utf8::decode(b""); assert_eq!(None, ch); assert_eq!(0, size); let (ch, size) = utf8::decode(b"\xFF"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode(b"\xCE\xF0"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode(b"\xE2\x98\xF0"); assert_eq!(None, ch); assert_eq!(2, size); let (ch, size) = utf8::decode(b"\xF0\x9D\x9D"); assert_eq!(None, ch); assert_eq!(3, size); let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0"); assert_eq!(None, ch); assert_eq!(3, size); let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode(b"\xED\xA0\x80"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode(b"\xCEa"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode(b"\xE2\x98a"); assert_eq!(None, ch); assert_eq!(2, size); let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca"); assert_eq!(None, ch); assert_eq!(3, size); } #[test] fn decode_lossy() { let (ch, size) = utf8::decode_lossy(b""); assert_eq!('\u{FFFD}', ch); assert_eq!(0, size); let (ch, size) = utf8::decode_lossy(b"\xFF"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_lossy(b"\xCE\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(2, size); let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(3, size); let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_lossy(b"\xCEa"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_lossy(b"\xE2\x98a"); assert_eq!('\u{FFFD}', ch); assert_eq!(2, size); let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca"); assert_eq!('\u{FFFD}', ch); assert_eq!(3, size); } #[test] fn decode_last_valid() { fn d(mut s: &str) -> Vec<char> { let mut chars = vec![]; while !s.is_empty() { let (ch, size) = utf8::decode_last(s.as_bytes()); s = &s[..s.len() - size]; chars.push(ch.unwrap()); } chars } assert_eq!(vec!['☃'], d("☃")); assert_eq!(vec!['☃', '☃'], d("☃☃")); assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε")); assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇")); assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲")); } #[test] fn decode_last_invalid() { let (ch, size) = utf8::decode_last(b""); assert_eq!(None, ch); assert_eq!(0, size); let (ch, size) = utf8::decode_last(b"\xFF"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xCE\xF0"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xCE"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xE2\x98"); assert_eq!(None, ch);
assert_eq!(2, size); let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D"); assert_eq!(None, ch); assert_eq!(3, size); let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xED\xA0\x80"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xED\xA0"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"\xED"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"a\xCE"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last(b"a\xE2\x98"); assert_eq!(None, ch); assert_eq!(2, size); let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C"); assert_eq!(None, ch); assert_eq!(3, size); } #[test] fn decode_last_lossy() { let (ch, size) = utf8::decode_last_lossy(b""); assert_eq!('\u{FFFD}', ch); assert_eq!(0, size); let (ch, size) = utf8::decode_last_lossy(b"\xFF"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xCE"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98"); assert_eq!('\u{FFFD}', ch); assert_eq!(2, size); let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D"); assert_eq!('\u{FFFD}', ch); assert_eq!(3, size); let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"\xED"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"a\xCE"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98"); assert_eq!('\u{FFFD}', ch); assert_eq!(2, size); let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C"); assert_eq!('\u{FFFD}', ch); assert_eq!(3, size); } #[test] fn chars() { for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() { let got: String = B(input).chars().collect(); assert_eq!( expected, got, "chars(ith: {:?}, given: {:?})", i, input, ); let got: String = B(input).char_indices().map(|(_, _, ch)| ch).collect(); assert_eq!( expected, got, "char_indices(ith: {:?}, given: {:?})", i, input, ); let expected: String = expected.chars().rev().collect(); let got: String = B(input).chars().rev().collect(); assert_eq!( expected, got, "chars.rev(ith: {:?}, given: {:?})", i, input, ); let got: String = B(input).char_indices().rev().map(|(_, _, ch)| ch).collect(); assert_eq!( expected, got, "char_indices.rev(ith: {:?}, given: {:?})", i, input, ); } } #[test] fn utf8_chunks() { let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" }; assert_eq!( (c.next(), c.next()), ( Some(utf8::Utf8Chunk { valid: "123", invalid: b"\xC0".as_bstr(), incomplete: false, }), None, ) ); let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" }; assert_eq!( (c.next(), c.next(), c.next()), ( Some(utf8::Utf8Chunk { valid: "123", invalid: 
b"\xFF".as_bstr(), incomplete: false, }), Some(utf8::Utf8Chunk { valid: "", invalid: b"\xFF".as_bstr(), incomplete: false, }), None, ) ); let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" }; assert_eq!( (c.next(), c.next()), ( Some(utf8::Utf8Chunk { valid: "123", invalid: b"\xD0".as_bstr(), incomplete: true, }), None, ) ); let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" }; assert_eq!( (c.next(), c.next(), c.next()), ( Some(utf8::Utf8Chunk { valid: "123", invalid: b"\xD0".as_bstr(), incomplete: false, }), Some(utf8::Utf8Chunk { valid: "456", invalid: b"".as_bstr(), incomplete: false, }), None, ) ); let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" }; assert_eq!( (c.next(), c.next()), ( Some(utf8::Utf8Chunk { valid: "123", invalid: b"\xE2\x98".as_bstr(), incomplete: true, }), None, ) ); let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" }; assert_eq!( (c.next(), c.next()), ( Some(utf8::Utf8Chunk { valid: "123", invalid: b"\xF4\x8F\xBF".as_bstr(), incomplete: true, }), None, ) ); } }