Adding upstream version 0.70.1+ds1.upstream/0.70.1+ds1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:47:55 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:47:55 +0000
commit: 2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4 (patch)
tree: 033cc839730fda84ff08db877037977be94e5e3a /vendor/bstr/src/utf8.rs
parent: Initial commit. (diff)
download: cargo-2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4.tar.xz
cargo-2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4.zip
1 files changed, 1369 insertions, 0 deletions
diff --git a/vendor/bstr/src/utf8.rs b/vendor/bstr/src/utf8.rs
new file mode 100644
index 0000000..4b5bc20
--- /dev/null
+++ b/vendor/bstr/src/utf8.rs
@@ -0,0 +1,1369 @@
+use core::{char, cmp, fmt, str};
+
+#[cfg(feature = "std")]
+use std::error;
+
+use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
+
+// The UTF-8 decoder provided here is based on the one presented here:
+// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+//
+// We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
+// using regex-automata that is roughly the same size. The real benefit of
+// Hoehrmann's formulation is that the byte class mapping below is manually
+// tailored such that each byte's class doubles as a shift to mask out the
+// bits necessary for constructing the leading bits of each codepoint value
+// from the initial byte.
+//
+// There are some minor differences between this implementation and Hoehrmann's
+// formulation.
+//
+// Firstly, we make REJECT have state ID 0, since it makes the state table
+// itself a little easier to read and is consistent with the notion that 0
+// means "false" or "bad."
+//
+// Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
+// path.
+//
+// Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
+// in the core decoding loop. (Which is what regex-automata would do by
+// default.)
+//
+// Fourthly, we split the byte class mapping and transition table into two
+// arrays because it's clearer.
+//
+// It is unlikely that this is the fastest way to do UTF-8 decoding, however,
+// it is fairly simple.
+
+const ACCEPT: usize = 12;
+const REJECT: usize = 0;
+
+/// SAFETY: The decode below function relies on the correctness of these
+/// equivalence classes.
+#[cfg_attr(rustfmt, rustfmt::skip)]
+const CLASSES: [u8; 256] = [
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+];
+
+/// SAFETY: The decode below function relies on the correctness of this state
+/// machine.
+#[cfg_attr(rustfmt, rustfmt::skip)]
+const STATES_FORWARD: &'static [u8] = &[
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
+  0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
+  0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
+  0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
+  0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
+  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+];
+
+/// An iterator over Unicode scalar values in a byte string.
+///
+/// When invalid UTF-8 byte sequences are found, they are substituted with the
+/// Unicode replacement codepoint (`U+FFFD`) using the
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
+///
+/// This iterator is created by the
+/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
+/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
+#[derive(Clone, Debug)]
+pub struct Chars<'a> {
+    bs: &'a [u8],
+}
+
+impl<'a> Chars<'a> {
+    pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
+        Chars { bs }
+    }
+
+    /// View the underlying data as a subslice of the original data.
+    ///
+    /// The slice returned has the same lifetime as the original slice, and so
+    /// the iterator can continue to be used while this exists.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// let mut chars = b"abc".chars();
+    ///
+    /// assert_eq!(b"abc", chars.as_bytes());
+    /// chars.next();
+    /// assert_eq!(b"bc", chars.as_bytes());
+    /// chars.next();
+    /// chars.next();
+    /// assert_eq!(b"", chars.as_bytes());
+    /// ```
+    #[inline]
+    pub fn as_bytes(&self) -> &'a [u8] {
+        self.bs
+    }
+}
+
+impl<'a> Iterator for Chars<'a> {
+    type Item = char;
+
+    #[inline]
+    fn next(&mut self) -> Option<char> {
+        let (ch, size) = decode_lossy(self.bs);
+        if size == 0 {
+            return None;
+        }
+        self.bs = &self.bs[size..];
+        Some(ch)
+    }
+}
+
+impl<'a> DoubleEndedIterator for Chars<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<char> {
+        let (ch, size) = decode_last_lossy(self.bs);
+        if size == 0 {
+            return None;
+        }
+        self.bs = &self.bs[..self.bs.len() - size];
+        Some(ch)
+    }
+}
+
+/// An iterator over Unicode scalar values in a byte string and their
+/// byte index positions.
+///
+/// When invalid UTF-8 byte sequences are found, they are substituted with the
+/// Unicode replacement codepoint (`U+FFFD`) using the
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
+///
+/// Note that this is slightly different from the `CharIndices` iterator
+/// provided by the standard library. Aside from working on possibly invalid
+/// UTF-8, this iterator provides both the corresponding starting and ending
+/// byte indices of each codepoint yielded. The ending position is necessary to
+/// slice the original byte string when invalid UTF-8 bytes are converted into
+/// a Unicode replacement codepoint, since a single replacement codepoint can
+/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
+///
+/// This iterator is created by the
+/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
+/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
+#[derive(Clone, Debug)]
+pub struct CharIndices<'a> {
+    bs: &'a [u8],
+    forward_index: usize,
+    reverse_index: usize,
+}
+
+impl<'a> CharIndices<'a> {
+    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
+        CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
+    }
+
+    /// View the underlying data as a subslice of the original data.
+    ///
+    /// The slice returned has the same lifetime as the original slice, and so
+    /// the iterator can continue to be used while this exists.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// let mut it = b"abc".char_indices();
+    ///
+    /// assert_eq!(b"abc", it.as_bytes());
+    /// it.next();
+    /// assert_eq!(b"bc", it.as_bytes());
+    /// it.next();
+    /// it.next();
+    /// assert_eq!(b"", it.as_bytes());
+    /// ```
+    #[inline]
+    pub fn as_bytes(&self) -> &'a [u8] {
+        self.bs
+    }
+}
+
+impl<'a> Iterator for CharIndices<'a> {
+    type Item = (usize, usize, char);
+
+    #[inline]
+    fn next(&mut self) -> Option<(usize, usize, char)> {
+        let index = self.forward_index;
+        let (ch, size) = decode_lossy(self.bs);
+        if size == 0 {
+            return None;
+        }
+        self.bs = &self.bs[size..];
+        self.forward_index += size;
+        Some((index, index + size, ch))
+    }
+}
+
+impl<'a> DoubleEndedIterator for CharIndices<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<(usize, usize, char)> {
+        let (ch, size) = decode_last_lossy(self.bs);
+        if size == 0 {
+            return None;
+        }
+        self.bs = &self.bs[..self.bs.len() - size];
+        self.reverse_index -= size;
+        Some((self.reverse_index, self.reverse_index + size, ch))
+    }
+}
+
+impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
+
+/// An iterator over chunks of valid UTF-8 in a byte slice.
+///
+/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
+#[derive(Clone, Debug)]
+pub struct Utf8Chunks<'a> {
+    pub(super) bytes: &'a [u8],
+}
+
+/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
+///
+/// This is yielded by the
+/// [`Utf8Chunks`](struct.Utf8Chunks.html)
+/// iterator, which can be created via the
+/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
+/// method.
+///
+/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
+/// are being iterated over.
+#[cfg_attr(test, derive(Debug, PartialEq))]
+pub struct Utf8Chunk<'a> {
+    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
+    ///
+    /// This is empty between adjacent invalid UTF-8 byte sequences.
+    valid: &'a str,
+    /// A sequence of invalid UTF-8 bytes.
+    ///
+    /// Can only be empty in the last chunk.
+    ///
+    /// Should be replaced by a single unicode replacement character, if not
+    /// empty.
+    invalid: &'a BStr,
+    /// Indicates whether the invalid sequence could've been valid if there
+    /// were more bytes.
+    ///
+    /// Can only be true in the last chunk.
+    incomplete: bool,
+}
+
+impl<'a> Utf8Chunk<'a> {
+    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
+    ///
+    /// This may be empty if there are consecutive sequences of invalid UTF-8
+    /// bytes.
+    #[inline]
+    pub fn valid(&self) -> &'a str {
+        self.valid
+    }
+
+    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
+    /// immediately follow the valid UTF-8 bytes in this chunk.
+    ///
+    /// This is only empty when this chunk corresponds to the last chunk in
+    /// the original bytes.
+    ///
+    /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
+    /// sequences greater than 1 always correspond to a valid _prefix_ of
+    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
+    /// of maximal subparts" strategy that is described in more detail in the
+    /// docs for the
+    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
+    /// method.
+    #[inline]
+    pub fn invalid(&self) -> &'a [u8] {
+        self.invalid.as_bytes()
+    }
+
+    /// Returns whether the invalid sequence might still become valid if more
+    /// bytes are added.
+    ///
+    /// Returns true if the end of the input was reached unexpectedly,
+    /// without encountering an unexpected byte.
+    ///
+    /// This can only be the case for the last chunk.
+    #[inline]
+    pub fn incomplete(&self) -> bool {
+        self.incomplete
+    }
+}
+
+impl<'a> Iterator for Utf8Chunks<'a> {
+    type Item = Utf8Chunk<'a>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
+        if self.bytes.is_empty() {
+            return None;
+        }
+        match validate(self.bytes) {
+            Ok(()) => {
+                let valid = self.bytes;
+                self.bytes = &[];
+                Some(Utf8Chunk {
+                    // SAFETY: This is safe because of the guarantees provided
+                    // by utf8::validate.
+                    valid: unsafe { str::from_utf8_unchecked(valid) },
+                    invalid: [].as_bstr(),
+                    incomplete: false,
+                })
+            }
+            Err(e) => {
+                let (valid, rest) = self.bytes.split_at(e.valid_up_to());
+                // SAFETY: This is safe because of the guarantees provided by
+                // utf8::validate.
+                let valid = unsafe { str::from_utf8_unchecked(valid) };
+                let (invalid_len, incomplete) = match e.error_len() {
+                    Some(n) => (n, false),
+                    None => (rest.len(), true),
+                };
+                let (invalid, rest) = rest.split_at(invalid_len);
+                self.bytes = rest;
+                Some(Utf8Chunk {
+                    valid,
+                    invalid: invalid.as_bstr(),
+                    incomplete,
+                })
+            }
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        if self.bytes.is_empty() {
+            (0, Some(0))
+        } else {
+            (1, Some(self.bytes.len()))
+        }
+    }
+}
+
+impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
+
+/// An error that occurs when UTF-8 decoding fails.
+///
+/// This error occurs when attempting to convert a non-UTF-8 byte
+/// string to a Rust string that must be valid UTF-8. For example,
+/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
+///
+/// # Example
+///
+/// This example shows what happens when a given byte sequence is invalid,
+/// but ends with a sequence that is a possible prefix of valid UTF-8.
+///
+/// ```
+/// use bstr::{B, ByteSlice};
+///
+/// let s = B(b"foobar\xF1\x80\x80");
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// assert_eq!(err.error_len(), None);
+/// ```
+///
+/// This example shows what happens when a given byte sequence contains
+/// invalid UTF-8.
+///
+/// ```
+/// use bstr::ByteSlice;
+///
+/// let s = b"foobar\xF1\x80\x80quux";
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// // The error length reports the maximum number of bytes that correspond to
+/// // a valid prefix of a UTF-8 encoded codepoint.
+/// assert_eq!(err.error_len(), Some(3));
+///
+/// // In contrast to the above which contains a single invalid prefix,
+/// // consider the case of multiple individual bytes that are never valid
+/// // prefixes. Note how the value of error_len changes!
+/// let s = b"foobar\xFF\xFFquux";
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// assert_eq!(err.error_len(), Some(1));
+///
+/// // The fact that it's an invalid prefix does not change error_len even
+/// // when it immediately precedes the end of the string.
+/// let s = b"foobar\xFF";
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// assert_eq!(err.error_len(), Some(1));
+/// ```
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Utf8Error {
+    valid_up_to: usize,
+    error_len: Option<usize>,
+}
+
+impl Utf8Error {
+    /// Returns the byte index of the position immediately following the last
+    /// valid UTF-8 byte.
+    ///
+    /// # Example
+    ///
+    /// This examples shows how `valid_up_to` can be used to retrieve a
+    /// possibly empty prefix that is guaranteed to be valid UTF-8:
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// let s = b"foobar\xF1\x80\x80quux";
+    /// let err = s.to_str().unwrap_err();
+    ///
+    /// // This is guaranteed to never panic.
+    /// let string = s[..err.valid_up_to()].to_str().unwrap();
+    /// assert_eq!(string, "foobar");
+    /// ```
+    #[inline]
+    pub fn valid_up_to(&self) -> usize {
+        self.valid_up_to
+    }
+
+    /// Returns the total number of invalid UTF-8 bytes immediately following
+    /// the position returned by `valid_up_to`. This value is always at least
+    /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8
+    /// encoded codepoint.
+    ///
+    /// If the end of the original input was found before a valid UTF-8 encoded
+    /// codepoint could be completed, then this returns `None`. This is useful
+    /// when processing streams, where a `None` value signals that more input
+    /// might be needed.
+    #[inline]
+    pub fn error_len(&self) -> Option<usize> {
+        self.error_len
+    }
+}
+
+#[cfg(feature = "std")]
+impl error::Error for Utf8Error {
+    fn description(&self) -> &str {
+        "invalid UTF-8"
+    }
+}
+
+impl fmt::Display for Utf8Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
+    }
+}
+
+/// Returns OK if and only if the given slice is completely valid UTF-8.
+///
+/// If the slice isn't valid UTF-8, then an error is returned that explains
+/// the first location at which invalid UTF-8 was detected.
+pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
+    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
+    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
+    // detected, it backs up and runs the slower version of the UTF-8 automaton
+    // to determine correct error information.
+    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
+        let mut state = ACCEPT;
+        let mut i = 0;
+
+        while i < slice.len() {
+            let b = slice[i];
+
+            // ASCII fast path. If we see two consecutive ASCII bytes, then try
+            // to validate as much ASCII as possible very quickly.
+            if state == ACCEPT
+                && b <= 0x7F
+                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
+            {
+                i += ascii::first_non_ascii_byte(&slice[i..]);
+                continue;
+            }
+
+            state = step(state, b);
+            if state == REJECT {
+                return Err(find_valid_up_to(slice, i));
+            }
+            i += 1;
+        }
+        if state != ACCEPT {
+            Err(find_valid_up_to(slice, slice.len()))
+        } else {
+            Ok(())
+        }
+    }
+
+    // Given the first position at which a UTF-8 sequence was determined to be
+    // invalid, return an error that correctly reports the position at which
+    // the last complete UTF-8 sequence ends.
+    #[inline(never)]
+    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
+        // In order to find the last valid byte, we need to back up an amount
+        // that guarantees every preceding byte is part of a valid UTF-8
+        // code unit sequence. To do this, we simply locate the last leading
+        // byte that occurs before rejected_at.
+        let mut backup = rejected_at.saturating_sub(1);
+        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
+            backup -= 1;
+        }
+        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
+        let mut err = slow(&slice[backup..upto]).unwrap_err();
+        err.valid_up_to += backup;
+        err
+    }
+
+    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
+    // when an invalid sequence is found. This is split out from validate so
+    // that the fast path doesn't need to keep track of the position of the
+    // last valid UTF-8 byte. In particular, tracking this requires checking
+    // for an ACCEPT state on each byte, which degrades throughput pretty
+    // badly.
+    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
+        let mut state = ACCEPT;
+        let mut valid_up_to = 0;
+        for (i, &b) in slice.iter().enumerate() {
+            state = step(state, b);
+            if state == ACCEPT {
+                valid_up_to = i + 1;
+            } else if state == REJECT {
+                // Our error length must always be at least 1.
+                let error_len = Some(cmp::max(1, i - valid_up_to));
+                return Err(Utf8Error { valid_up_to, error_len });
+            }
+        }
+        if state != ACCEPT {
+            Err(Utf8Error { valid_up_to, error_len: None })
+        } else {
+            Ok(())
+        }
+    }
+
+    // Advance to the next state given the current state and current byte.
+    fn step(state: usize, b: u8) -> usize {
+        let class = CLASSES[b as usize];
+        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
+        // always <=96. Therefore, the maximal index is 96+11 = 107, where
+        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
+        // valid by construction of the state machine and the byte equivalence
+        // classes.
+        unsafe {
+            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
+        }
+    }
+
+    fast(slice)
+}
+
+/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, `None` is returned along with the number of bytes that
+/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
+/// the number of bytes consumed is always between 0 and 3, inclusive, where
+/// 0 is only returned when `slice` is empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use bstr::decode_utf8;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
+/// assert_eq!(Some('☃'), ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_utf8(b"\xE2\x98");
+/// assert_eq!(None, ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes, while replacing invalid UTF-8 sequences with the replacement
+/// codepoint:
+///
+/// ```
+/// use bstr::{B, decode_utf8};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+///     let (ch, size) = decode_utf8(bytes);
+///     bytes = &bytes[size..];
+///     chars.push(ch.unwrap_or('\u{FFFD}'));
+/// }
+/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
+/// ```
+#[inline]
+pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
+    let slice = slice.as_ref();
+    match slice.get(0) {
+        None => return (None, 0),
+        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
+        _ => {}
+    }
+
+    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
+    while i < slice.len() {
+        decode_step(&mut state, &mut cp, slice[i]);
+        i += 1;
+
+        if state == ACCEPT {
+            // SAFETY: This is safe because `decode_step` guarantees that
+            // `cp` is a valid Unicode scalar value in an ACCEPT state.
+            let ch = unsafe { char::from_u32_unchecked(cp) };
+            return (Some(ch), i);
+        } else if state == REJECT {
+            // At this point, we always want to advance at least one byte.
+            return (None, cmp::max(1, i.saturating_sub(1)));
+        }
+    }
+    (None, i)
+}
+
+/// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
+/// slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
+/// along with the number of bytes that make up a maximal prefix of a valid
+/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
+/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
+/// empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```ignore
+/// use bstr::decode_utf8_lossy;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
+/// assert_eq!('☃', ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
+/// assert_eq!('\u{FFFD}', ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes, while replacing invalid UTF-8 sequences with the replacement
+/// codepoint:
+///
+/// ```ignore
+/// use bstr::{B, decode_utf8_lossy};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+///     let (ch, size) = decode_utf8_lossy(bytes);
+///     bytes = &bytes[size..];
+///     chars.push(ch);
+/// }
+/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
+/// ```
+#[inline]
+pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
+    match decode(slice) {
+        (Some(ch), size) => (ch, size),
+        (None, size) => ('\u{FFFD}', size),
+    }
+}
+
+/// UTF-8 decode a single Unicode scalar value from the end of a slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, `None` is returned along with the number of bytes that
+/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
+/// the number of bytes consumed is always between 0 and 3, inclusive, where
+/// 0 is only returned when `slice` is empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use bstr::decode_last_utf8;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
+/// assert_eq!(Some('☃'), ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_last_utf8(b"\xE2\x98");
+/// assert_eq!(None, ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes in reverse, while replacing invalid UTF-8 sequences with the
+/// replacement codepoint:
+///
+/// ```
+/// use bstr::{B, decode_last_utf8};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+///     let (ch, size) = decode_last_utf8(bytes);
+///     bytes = &bytes[..bytes.len()-size];
+///     chars.push(ch.unwrap_or('\u{FFFD}'));
+/// }
+/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
+/// ```
+#[inline]
+pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
+    // TODO: We could implement this by reversing the UTF-8 automaton, but for
+    // now, we do it the slow way by using the forward automaton.
+
+    let slice = slice.as_ref();
+    if slice.is_empty() {
+        return (None, 0);
+    }
+    let mut start = slice.len() - 1;
+    let limit = slice.len().saturating_sub(4);
+    while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
+        start -= 1;
+    }
+    let (ch, size) = decode(&slice[start..]);
+    // If we didn't consume all of the bytes, then that means there's at least
+    // one stray byte that never occurs in a valid code unit prefix, so we can
+    // advance by one byte.
+    if start + size != slice.len() {
+        (None, 1)
+    } else {
+        (ch, size)
+    }
+}
+
+/// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
+/// along with the number of bytes that make up a maximal prefix of a valid
+/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
+/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
+/// empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```ignore
+/// use bstr::decode_last_utf8_lossy;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
+/// assert_eq!('☃', ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
+/// assert_eq!('\u{FFFD}', ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes in reverse, while replacing invalid UTF-8 sequences with the
+/// replacement codepoint:
+///
+/// ```ignore
+/// use bstr::decode_last_utf8_lossy;
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+///     let (ch, size) = decode_last_utf8_lossy(bytes);
+///     bytes = &bytes[..bytes.len()-size];
+///     chars.push(ch);
+/// }
+/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
+/// ```
+#[inline]
+pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
+    match decode_last(slice) {
+        (Some(ch), size) => (ch, size),
+        (None, size) => ('\u{FFFD}', size),
+    }
+}
+
+/// SAFETY: The decode function relies on state being equal to ACCEPT only if
+/// cp is a valid Unicode scalar value.
+#[inline]
+pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
+    let class = CLASSES[b as usize];
+    if *state == ACCEPT {
+        *cp = (0xFF >> class) & (b as u32);
+    } else {
+        *cp = (b as u32 & 0b111111) | (*cp << 6);
+    }
+    *state = STATES_FORWARD[*state + class as usize] as usize;
+}
+
+/// Returns true if and only if the given byte is either a valid leading UTF-8
+/// byte, or is otherwise an invalid byte that can never appear anywhere in a
+/// valid UTF-8 sequence.
+fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
+    // In the ASCII case, the most significant bit is never set. The leading
+    // byte of a 2/3/4-byte sequence always has the top two most significant
+    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
+    // also returns true, since every such byte has its two most significant
+    // bits set:
+    //
+    //     \xC0 :: 11000000
+    //     \xC1 :: 11000001
+    //     \xF5 :: 11110101
+    //     \xF6 :: 11110110
+    //     \xF7 :: 11110111
+    //     \xF8 :: 11111000
+    //     \xF9 :: 11111001
+    //     \xFA :: 11111010
+    //     \xFB :: 11111011
+    //     \xFC :: 11111100
+    //     \xFD :: 11111101
+    //     \xFE :: 11111110
+    //     \xFF :: 11111111
+    (b & 0b1100_0000) != 0b1000_0000
+}
+
+#[cfg(all(test, feature = "std"))]
+mod tests {
+    use std::char;
+
+    use crate::{
+        ext_slice::{ByteSlice, B},
+        tests::LOSSY_TESTS,
+        utf8::{self, Utf8Error},
+    };
+
+    fn utf8e(valid_up_to: usize) -> Utf8Error {
+        Utf8Error { valid_up_to, error_len: None }
+    }
+
+    fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
+        Utf8Error { valid_up_to, error_len: Some(error_len) }
+    }
+
+    #[test]
+    #[cfg(not(miri))]
+    fn validate_all_codepoints() {
+        for i in 0..(0x10FFFF + 1) {
+            let cp = match char::from_u32(i) {
+                None => continue,
+                Some(cp) => cp,
+            };
+            let mut buf = [0; 4];
+            let s = cp.encode_utf8(&mut buf);
+            assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
+        }
+    }
+
+    #[test]
+    fn validate_multiple_codepoints() {
+        assert_eq!(Ok(()), utf8::validate(b"abc"));
+        assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
+        assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
+        assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
+        assert_eq!(
+            Ok(()),
+            utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
+        );
+        assert_eq!(
+            Ok(()),
+            utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
+        );
+    }
+
+    #[test]
+    fn validate_errors() {
+        // single invalid byte
+        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
+        // single invalid byte after ASCII
+        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
+        // single invalid byte after 2 byte sequence
+        assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
+        // single invalid byte after 3 byte sequence
+        assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
+        // single invalid byte after 4 byte sequence
+        assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));
+
+        // An invalid 2-byte sequence with a valid 1-byte prefix.
+        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
+        // An invalid 3-byte sequence with a valid 2-byte prefix.
+        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
+        // An invalid 4-byte sequence with a valid 3-byte prefix.
+        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));
+
+        // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
+        // same codepoint value in 4 bytes. This not only tests that we reject
+        // overlong sequences, but that we get valid_up_to correct.
+        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
+        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
+        assert_eq!(
+            Err(utf8e2(3, 1)),
+            utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",)
+        );
+
+        // Check that encoding a surrogate codepoint using the UTF-8 scheme
+        // fails validation.
+        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
+        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
+        assert_eq!(
+            Err(utf8e2(3, 1)),
+            utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",)
+        );
+
+        // Check that an incomplete 2-byte sequence fails.
+        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
+        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
+        assert_eq!(
+            Err(utf8e2(3, 1)),
+            utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",)
+        );
+        // Check that an incomplete 3-byte sequence fails.
+        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
+        assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
+        assert_eq!(
+            Err(utf8e2(3, 2)),
+            utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
+        );
+        // Check that an incomplete 4-byte sequence fails.
+        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
+        assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
+        assert_eq!(
+            Err(utf8e2(4, 3)),
+            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
+        );
+        assert_eq!(
+            Err(utf8e2(6, 3)),
+            utf8::validate(b"foobar\xF1\x80\x80quux",)
+        );
+
+        // Check that an incomplete (EOF) 2-byte sequence fails.
+        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
+        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
+        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
+        // Check that an incomplete (EOF) 3-byte sequence fails.
+        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
+        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
+        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
+        // Check that an incomplete (EOF) 4-byte sequence fails.
+        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
+        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
+        assert_eq!(
+            Err(utf8e(4)),
+            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
+        );
+
+        // Test that we errors correct even after long valid sequences. This
+        // checks that our "backup" logic for detecting errors is correct.
+        assert_eq!(
+            Err(utf8e2(8, 1)),
+            utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
+        );
+    }
+
+    #[test]
+    fn decode_valid() {
+        fn d(mut s: &str) -> Vec<char> {
+            let mut chars = vec![];
+            while !s.is_empty() {
+                let (ch, size) = utf8::decode(s.as_bytes());
+                s = &s[size..];
+                chars.push(ch.unwrap());
+            }
+            chars
+        }
+
+        assert_eq!(vec!['☃'], d("☃"));
+        assert_eq!(vec!['☃', '☃'], d("☃☃"));
+        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
+        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
+        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
+    }
+
+    #[test]
+    fn decode_invalid() {
+        let (ch, size) = utf8::decode(b"");
+        assert_eq!(None, ch);
+        assert_eq!(0, size);
+
+        let (ch, size) = utf8::decode(b"\xFF");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode(b"\xCE\xF0");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode(b"\xE2\x98\xF0");
+        assert_eq!(None, ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D");
+        assert_eq!(None, ch);
+        assert_eq!(3, size);
+
+        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0");
+        assert_eq!(None, ch);
+        assert_eq!(3, size);
+
+        let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode(b"\xED\xA0\x80");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode(b"\xCEa");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode(b"\xE2\x98a");
+        assert_eq!(None, ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca");
+        assert_eq!(None, ch);
+        assert_eq!(3, size);
+    }
+
+    #[test]
+    fn decode_lossy() {
+        let (ch, size) = utf8::decode_lossy(b"");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(0, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xFF");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xCE\xF0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(3, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xCEa");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xE2\x98a");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(3, size);
+    }
+
+    #[test]
+    fn decode_last_valid() {
+        fn d(mut s: &str) -> Vec<char> {
+            let mut chars = vec![];
+            while !s.is_empty() {
+                let (ch, size) = utf8::decode_last(s.as_bytes());
+                s = &s[..s.len() - size];
+                chars.push(ch.unwrap());
+            }
+            chars
+        }
+
+        assert_eq!(vec!['☃'], d("☃"));
+        assert_eq!(vec!['☃', '☃'], d("☃☃"));
+        assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
+        assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
+        assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲"));
+    }
+
+    #[test]
+    fn decode_last_invalid() {
+        let (ch, size) = utf8::decode_last(b"");
+        assert_eq!(None, ch);
+        assert_eq!(0, size);
+
+        let (ch, size) = utf8::decode_last(b"\xFF");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xCE\xF0");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xCE");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xE2\x98");
+        assert_eq!(None, ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D");
+        assert_eq!(None, ch);
+        assert_eq!(3, size);
+
+        let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xED\xA0\x80");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xED\xA0");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"\xED");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"a\xCE");
+        assert_eq!(None, ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last(b"a\xE2\x98");
+        assert_eq!(None, ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C");
+        assert_eq!(None, ch);
+        assert_eq!(3, size);
+    }
+
+    #[test]
+    fn decode_last_lossy() {
+        let (ch, size) = utf8::decode_last_lossy(b"");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(0, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xFF");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xCE");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(3, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"\xED");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"a\xCE");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(1, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(2, size);
+
+        let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C");
+        assert_eq!('\u{FFFD}', ch);
+        assert_eq!(3, size);
+    }
+
+    #[test]
+    fn chars() {
+        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
+            let got: String = B(input).chars().collect();
+            assert_eq!(
+                expected, got,
+                "chars(ith: {:?}, given: {:?})",
+                i, input,
+            );
+            let got: String =
+                B(input).char_indices().map(|(_, _, ch)| ch).collect();
+            assert_eq!(
+                expected, got,
+                "char_indices(ith: {:?}, given: {:?})",
+                i, input,
+            );
+
+            let expected: String = expected.chars().rev().collect();
+
+            let got: String = B(input).chars().rev().collect();
+            assert_eq!(
+                expected, got,
+                "chars.rev(ith: {:?}, given: {:?})",
+                i, input,
+            );
+            let got: String =
+                B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
+            assert_eq!(
+                expected, got,
+                "char_indices.rev(ith: {:?}, given: {:?})",
+                i, input,
+            );
+        }
+    }
+
+    #[test]
+    fn utf8_chunks() {
+        let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" };
+        assert_eq!(
+            (c.next(), c.next()),
+            (
+                Some(utf8::Utf8Chunk {
+                    valid: "123",
+                    invalid: b"\xC0".as_bstr(),
+                    incomplete: false,
+                }),
+                None,
+            )
+        );
+
+        let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
+        assert_eq!(
+            (c.next(), c.next(), c.next()),
+            (
+                Some(utf8::Utf8Chunk {
+                    valid: "123",
+                    invalid: b"\xFF".as_bstr(),
+                    incomplete: false,
+                }),
+                Some(utf8::Utf8Chunk {
+                    valid: "",
+                    invalid: b"\xFF".as_bstr(),
+                    incomplete: false,
+                }),
+                None,
+            )
+        );
+
+        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" };
+        assert_eq!(
+            (c.next(), c.next()),
+            (
+                Some(utf8::Utf8Chunk {
+                    valid: "123",
+                    invalid: b"\xD0".as_bstr(),
+                    incomplete: true,
+                }),
+                None,
+            )
+        );
+
+        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" };
+        assert_eq!(
+            (c.next(), c.next(), c.next()),
+            (
+                Some(utf8::Utf8Chunk {
+                    valid: "123",
+                    invalid: b"\xD0".as_bstr(),
+                    incomplete: false,
+                }),
+                Some(utf8::Utf8Chunk {
+                    valid: "456",
+                    invalid: b"".as_bstr(),
+                    incomplete: false,
+                }),
+                None,
+            )
+        );
+
+        let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
+        assert_eq!(
+            (c.next(), c.next()),
+            (
+                Some(utf8::Utf8Chunk {
+                    valid: "123",
+                    invalid: b"\xE2\x98".as_bstr(),
+                    incomplete: true,
+                }),
+                None,
+            )
+        );
+
+        let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
+        assert_eq!(
+            (c.next(), c.next()),
+            (
+                Some(utf8::Utf8Chunk {
+                    valid: "123",
+                    invalid: b"\xF4\x8F\xBF".as_bstr(),
+                    incomplete: true,
+                }),
+                None,
+            )
+        );
+    }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:47:55 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:47:55 +0000
commit	2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4 (patch)
tree	033cc839730fda84ff08db877037977be94e5e3a /vendor/bstr/src/utf8.rs
parent	Initial commit. (diff)
download	cargo-2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4.tar.xz cargo-2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4.zip