// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #![cfg_attr(test, feature(test))] #[macro_use] extern crate debug_unreachable; #[macro_use] extern crate mac; #[cfg(test)] extern crate test as std_test; use std::{slice, char}; /// Meaning of a complete or partial UTF-8 codepoint. /// /// Not all checking is performed eagerly. That is, a codepoint `Prefix` or /// `Suffix` may in reality have no valid completion. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] pub enum Meaning { /// We found a whole codepoint. Whole(char), /// We found something that isn't a valid Unicode codepoint, but /// it *would* correspond to a UTF-16 leading surrogate code unit, /// i.e. a value in the range `U+D800` - `U+DBFF`. /// /// The argument is the code unit's 10-bit index within that range. /// /// These are found in UTF-8 variants such as CESU-8 and WTF-8. LeadSurrogate(u16), /// We found something that isn't a valid Unicode codepoint, but /// it *would* correspond to a UTF-16 trailing surrogate code unit, /// i.e. a value in the range `U+DC00` - `U+DFFF`. /// /// The argument is the code unit's 10-bit index within that range. /// /// These are found in UTF-8 variants such as CESU-8 and WTF-8. TrailSurrogate(u16), /// We found only a prefix of a codepoint before the buffer ended. /// /// Includes the number of additional bytes needed. Prefix(usize), /// We found only a suffix of a codepoint before running off the /// start of the buffer. /// /// Up to 3 more bytes may be needed. Suffix, } /// Represents a complete or partial UTF-8 codepoint. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] pub struct Codepoint<'a> { /// The bytes that make up the partial or full codepoint. /// /// For a `Suffix` this depends on `idx`. We don't scan forward /// for additional continuation bytes after the reverse scan /// failed to locate a multibyte sequence start. pub bytes: &'a [u8], /// Start of the codepoint in the buffer, expressed as an offset /// back from `idx`. pub rewind: usize, /// Meaning of the partial or full codepoint. pub meaning: Meaning, } #[derive(Debug, PartialEq, Eq)] enum Byte { Ascii, Start(usize), Cont, } impl Byte { #[inline(always)] fn classify(x: u8) -> Option { match x & 0xC0 { 0xC0 => match x { x if x & 0b11111_000 == 0b11110_000 => Some(Byte::Start(4)), x if x & 0b1111_0000 == 0b1110_0000 => Some(Byte::Start(3)), x if x & 0b111_00000 == 0b110_00000 => Some(Byte::Start(2)), _ => None, }, 0x80 => Some(Byte::Cont), _ => Some(Byte::Ascii), } } } #[inline(always)] fn all_cont(buf: &[u8]) -> bool { buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont))) } // NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: // a starting byte followed by the correct number of continuation bytes. #[inline(always)] unsafe fn decode(buf: &[u8]) -> Option { debug_assert!(buf.len() >= 2); debug_assert!(buf.len() <= 4); let n; match buf.len() { 2 => { n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 | ((*buf.get_unchecked(1) & 0x3F) as u32); if n < 0x80 { return None } // Overlong } 3 => { n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 | ((*buf.get_unchecked(2) & 0x3F) as u32); match n { 0x0000 ... 0x07FF => return None, // Overlong 0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), 0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), _ => {} } } 4 => { n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 | ((*buf.get_unchecked(3) & 0x3F) as u32); if n < 0x1_0000 { return None } // Overlong } _ => debug_unreachable!(), } char::from_u32(n).map(Meaning::Whole) } #[inline(always)] unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { debug_assert!(start <= buf.len()); debug_assert!(new_len <= (buf.len() - start)); slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) } macro_rules! otry { ($x:expr) => { unwrap_or_return!($x, None) } } /// Describes the UTF-8 codepoint containing the byte at index `idx` within /// `buf`. /// /// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 /// in the vicinity of `idx`. #[inline] pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { if idx >= buf.len() { return None; } unsafe { let x = *buf.get_unchecked(idx); match otry!(Byte::classify(x)) { Byte::Ascii => Some(Codepoint { bytes: unsafe_slice(buf, idx, 1), rewind: 0, meaning: Meaning::Whole(x as char), }), Byte::Start(n) => { let avail = buf.len() - idx; if avail >= n { let bytes = unsafe_slice(buf, idx, n); if !all_cont(unsafe_slice(bytes, 1, n-1)) { return None; } let meaning = otry!(decode(bytes)); Some(Codepoint { bytes: bytes, rewind: 0, meaning: meaning, }) } else { Some(Codepoint { bytes: unsafe_slice(buf, idx, avail), rewind: 0, meaning: Meaning::Prefix(n - avail), }) } }, Byte::Cont => { let mut start = idx; let mut checked = 0; loop { if start == 0 { // Whoops, fell off the beginning. return Some(Codepoint { bytes: unsafe_slice(buf, 0, idx + 1), rewind: idx, meaning: Meaning::Suffix, }); } start -= 1; checked += 1; match otry!(Byte::classify(*buf.get_unchecked(start))) { Byte::Cont => (), Byte::Start(n) => { let avail = buf.len() - start; if avail >= n { let bytes = unsafe_slice(buf, start, n); if checked < n { if !all_cont(unsafe_slice(bytes, checked, n-checked)) { return None; } } let meaning = otry!(decode(bytes)); return Some(Codepoint { bytes: bytes, rewind: idx - start, meaning: meaning, }); } else { return Some(Codepoint { bytes: unsafe_slice(buf, start, avail), rewind: idx - start, meaning: Meaning::Prefix(n - avail), }); } } _ => return None, } if idx - start >= 3 { // We looked at 3 bytes before a continuation byte // and didn't find a start byte. return None; } } } } } } #[cfg(test)] mod test;