diff options
Diffstat (limited to 'vendor/byteyarn/src/utf8.rs')
-rw-r--r-- | vendor/byteyarn/src/utf8.rs | 151 |
1 files changed, 0 insertions, 151 deletions
//! UTF-8 utilities not provided by the standard library.
//
// Recovered from the removal diff of `vendor/byteyarn/src/utf8.rs`
// (index a0006807e..000000000), in which the file was collapsed onto one line.

use std::str;

#[cfg(doc)]
use crate::*;

/// An iterator over UTF-8 chunks in a byte buffer.
///
/// Runs of valid UTF-8 are yielded as `Ok(&str)`. Any time non-UTF-8 bytes
/// are encountered, they are yielded as `Err(&[u8])`, one invalid sequence
/// per item. Neither kind of item is ever empty.
///
/// See [`Yarn::utf8_chunks()`].
#[derive(Copy, Clone, Debug)]
pub struct Utf8Chunks<'a> {
    /// The bytes that have not been yielded yet.
    buf: &'a [u8],
    /// Length of an invalid sequence sitting at the front of `buf` that the
    /// next call to `next()` must yield as an `Err`.
    invalid_prefix: Option<usize>,
}

impl<'a> Utf8Chunks<'a> {
    /// Returns the rest of the underlying byte buffer that has not been yielded.
    ///
    /// This includes an invalid sequence that has been detected but not yet
    /// returned from `next()`.
    pub fn rest(self) -> &'a [u8] {
        self.buf
    }

    /// Creates an iterator over the chunks of `buf`.
    pub(crate) fn new(buf: &'a [u8]) -> Self {
        Self {
            buf,
            invalid_prefix: None,
        }
    }

    /// Splits the first `len` bytes off the front of the buffer and returns
    /// them, leaving the remainder in `self.buf`.
    ///
    /// # Safety
    ///
    /// `len` must be less than or equal to `self.buf.len()`.
    unsafe fn take(&mut self, len: usize) -> &'a [u8] {
        debug_assert!(len <= self.buf.len());

        // SAFETY: the caller guarantees `len <= self.buf.len()`, so both
        // ranges are in bounds.
        let (head, tail) = unsafe {
            (
                self.buf.get_unchecked(..len),
                self.buf.get_unchecked(len..),
            )
        };
        self.buf = tail;
        head
    }
}

impl<'a> Iterator for Utf8Chunks<'a> {
    type Item = Result<&'a str, &'a [u8]>;

    fn next(&mut self) -> Option<Self::Item> {
        // An invalid sequence found by the previous call is yielded first.
        if let Some(len) = self.invalid_prefix.take() {
            // SAFETY: self.invalid_prefix is only ever written to in this
            // function, where it gets set to a value that is known to be
            // in range of self.buf.
            let bytes = unsafe { self.take(len) };
            return Some(Err(bytes));
        }

        if self.buf.is_empty() {
            return None;
        }

        let err = match str::from_utf8(self.buf) {
            Ok(utf8) => {
                // Everything left is valid: yield it all and finish.
                self.buf = &[];
                return Some(Ok(utf8));
            }
            Err(err) => err,
        };

        // SAFETY: valid_up_to() always returns a value in range of self.buf.
        let valid = unsafe { self.take(err.valid_up_to()) };

        let utf8 = if cfg!(debug_assertions) {
            str::from_utf8(valid).expect("bytes before valid_up_to() must be valid UTF-8")
        } else {
            // SAFETY: the value of valid_up_to() delimits valid UTF-8, by
            // definition.
            unsafe { str::from_utf8_unchecked(valid) }
        };

        // `error_len()` is `None` when the input ends in the middle of a
        // character; in that case everything that is left is invalid.
        let invalid_len = err.error_len().unwrap_or(self.buf.len());

        if utf8.is_empty() {
            // Nothing valid precedes the bad bytes, so yield them right away
            // rather than an empty `Ok`.
            //
            // SAFETY: the invalid sequence reported by `from_utf8` lies
            // within the bytes that follow `valid_up_to()`, which is exactly
            // what self.buf holds now.
            let bytes = unsafe { self.take(invalid_len) };
            return Some(Err(bytes));
        }

        // Remember the bad bytes so the next call yields them.
        self.invalid_prefix = Some(invalid_len);
        Some(Ok(utf8))
    }
}

/// `const`-enabled UTF-8 encoding.
///
/// Returns the encoded bytes in a static array, and the number of those bytes
/// that are pertinent. Bytes past that count are zero.
pub const fn encode_utf8(c: char) -> ([u8; 4], usize) {
    // Each pair is a byte's fixed high-bit tag and the mask selecting the
    // payload bits that fit below that tag.
    const CONT: u8 = 0b1000_0000;
    const CONT_MASK: u8 = !CONT >> 1;

    const B1: u8 = 0b0000_0000;
    const B1_MASK: u8 = !B1 >> 1;

    const B2: u8 = 0b1100_0000;
    const B2_MASK: u8 = !B2 >> 1;

    const B3: u8 = 0b1110_0000;
    const B3_MASK: u8 = !B3 >> 1;

    const B4: u8 = 0b1111_0000;
    const B4_MASK: u8 = !B4 >> 1;

    /// Returns the low byte of `c` shifted right by `idx` groups of six
    /// bits; callers mask off the bits that do not belong to the group.
    const fn sextet(c: char, idx: u32) -> u8 {
        ((c as u32) >> (idx * 6)) as u8
    }

    match c.len_utf8() {
        1 => ([sextet(c, 0) & B1_MASK | B1, 0, 0, 0], 1),
        2 => (
            [
                sextet(c, 1) & B2_MASK | B2,
                sextet(c, 0) & CONT_MASK | CONT,
                0,
                0,
            ],
            2,
        ),
        3 => (
            [
                sextet(c, 2) & B3_MASK | B3,
                sextet(c, 1) & CONT_MASK | CONT,
                sextet(c, 0) & CONT_MASK | CONT,
                0,
            ],
            3,
        ),
        4 => (
            [
                sextet(c, 3) & B4_MASK | B4,
                sextet(c, 2) & CONT_MASK | CONT,
                sextet(c, 1) & CONT_MASK | CONT,
                sextet(c, 0) & CONT_MASK | CONT,
            ],
            4,
        ),
        // `char::len_utf8()` only ever returns 1 through 4.
        _ => unreachable!(),
    }
}