summaryrefslogtreecommitdiffstats
path: root/vendor/futf/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/futf/src/lib.rs')
-rw-r--r--vendor/futf/src/lib.rs248
1 files changed, 248 insertions, 0 deletions
diff --git a/vendor/futf/src/lib.rs b/vendor/futf/src/lib.rs
new file mode 100644
index 000000000..4b94a35a5
--- /dev/null
+++ b/vendor/futf/src/lib.rs
@@ -0,0 +1,248 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![cfg_attr(test, feature(test))]
+
+#[macro_use]
+extern crate debug_unreachable;
+
+#[macro_use]
+extern crate mac;
+
+#[cfg(test)]
+extern crate test as std_test;
+
+use std::{slice, char};
+
+/// What a run of bytes means when read as a full or partial UTF-8 codepoint.
+///
+/// Validation is lazy: a `Prefix` or `Suffix` is reported from the bytes seen
+/// so far, and there may be no valid way to complete it.
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Meaning {
+    /// A complete codepoint was found.
+    Whole(char),
+
+    /// The bytes do not encode a valid Unicode codepoint; they encode a
+    /// value in `U+D800` - `U+DBFF`, which *would* be a UTF-16 leading
+    /// surrogate code unit.
+    ///
+    /// The payload is the 10-bit index of that code unit within the range.
+    ///
+    /// UTF-8 variants such as CESU-8 and WTF-8 contain these.
+    LeadSurrogate(u16),
+
+    /// The bytes do not encode a valid Unicode codepoint; they encode a
+    /// value in `U+DC00` - `U+DFFF`, which *would* be a UTF-16 trailing
+    /// surrogate code unit.
+    ///
+    /// The payload is the 10-bit index of that code unit within the range.
+    ///
+    /// UTF-8 variants such as CESU-8 and WTF-8 contain these.
+    TrailSurrogate(u16),
+
+    /// The buffer ended partway through a codepoint.
+    ///
+    /// The payload is the number of further bytes needed to complete it.
+    Prefix(usize),
+
+    /// The buffer began partway through a codepoint, so only its tail
+    /// was found.
+    ///
+    /// As many as 3 more bytes may be missing from the front.
+    Suffix,
+}
+
+/// A full or partial UTF-8 codepoint located inside a byte buffer.
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Codepoint<'a> {
+    /// The bytes belonging to the (possibly partial) codepoint.
+    ///
+    /// For a `Suffix` the extent depends on `idx`: once the backward scan
+    /// has failed to find the start of a multibyte sequence, no forward
+    /// scan is made for further continuation bytes.
+    pub bytes: &'a [u8],
+
+    /// How many bytes before `idx` the codepoint starts in the buffer.
+    pub rewind: usize,
+
+    /// What the full or partial codepoint means.
+    pub meaning: Meaning,
+}
+
+/// Role a single byte can play in a UTF-8 stream, judged from the byte alone.
+#[derive(PartialEq, Eq, Debug)]
+enum Byte {
+    /// A byte below `0x80`.
+    Ascii,
+    /// First byte of a multi-byte sequence; the payload is the total
+    /// length of that sequence in bytes (2, 3 or 4).
+    Start(usize),
+    /// A continuation byte (`0x80` - `0xBF`).
+    Cont,
+}
+
+impl Byte {
+    /// Classifies `x` by its high bits.
+    ///
+    /// Returns `None` for `0xF8` - `0xFF`, which fit none of the categories.
+    /// No overlong or range checking happens here, so e.g. `0xC0` is still
+    /// reported as `Start(2)`.
+    #[inline(always)]
+    fn classify(x: u8) -> Option<Byte> {
+        // The categories occupy consecutive byte ranges, so an ordered
+        // chain of upper bounds identifies each one.
+        if x < 0x80 {
+            Some(Byte::Ascii)
+        } else if x < 0xC0 {
+            Some(Byte::Cont)
+        } else if x < 0xE0 {
+            Some(Byte::Start(2))
+        } else if x < 0xF0 {
+            Some(Byte::Start(3))
+        } else if x < 0xF8 {
+            Some(Byte::Start(4))
+        } else {
+            None
+        }
+    }
+}
+
+/// Reports whether every byte of `buf` classifies as a continuation byte.
+///
+/// An empty slice yields `true`.
+#[inline(always)]
+fn all_cont(buf: &[u8]) -> bool {
+    for &byte in buf {
+        if Byte::classify(byte) != Some(Byte::Cont) {
+            return false;
+        }
+    }
+    true
+}
+
+/// Decodes one multi-byte UTF-8 sequence into its `Meaning`.
+///
+/// Only the payload bits of each byte are read. The buffer is assumed to be
+/// a syntactically valid multi-byte sequence: a starting byte followed by
+/// the correct number of continuation bytes.
+///
+/// Returns `None` for an overlong encoding, or for a value that
+/// `char::from_u32` rejects (e.g. above `U+10FFFF`). A 3-byte sequence
+/// encoding a value in `U+D800` - `U+DFFF` is reported as `LeadSurrogate` or
+/// `TrailSurrogate` rather than rejected.
+///
+/// # Safety
+///
+/// `buf.len()` must be 2, 3 or 4. This is verified by debug assertions only;
+/// any other length reaches `debug_unreachable!()`.
+#[inline(always)]
+unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
+    debug_assert!(buf.len() >= 2);
+    debug_assert!(buf.len() <= 4);
+    let n;
+    match buf.len() {
+        2 => {
+            n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
+                | ((*buf.get_unchecked(1) & 0x3F) as u32);
+            if n < 0x80 { return None } // Overlong
+        }
+        3 => {
+            n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
+                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
+                | ((*buf.get_unchecked(2) & 0x3F) as u32);
+            // Inclusive ranges are spelled `..=`; the older `...` pattern
+            // syntax is deprecated and rejected by the 2021 edition.
+            match n {
+                0x0000..=0x07FF => return None, // Overlong
+                0xD800..=0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
+                0xDC00..=0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
+                _ => {}
+            }
+        }
+        4 => {
+            n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
+                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
+                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
+                | ((*buf.get_unchecked(3) & 0x3F) as u32);
+            if n < 0x1_0000 { return None } // Overlong
+        }
+        _ => debug_unreachable!(),
+    }
+
+    // Rejects anything that is not a Unicode scalar value.
+    char::from_u32(n).map(Meaning::Whole)
+}
+
+/// Returns the `new_len` bytes of `buf` that begin at offset `start`,
+/// without bounds checks outside of debug builds.
+///
+/// # Safety
+///
+/// `start <= buf.len()` and `new_len <= buf.len() - start` must hold. Both
+/// are verified by debug assertions only.
+#[inline(always)]
+unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
+    debug_assert!(start <= buf.len());
+    debug_assert!(new_len <= (buf.len() - start));
+    let end = start + new_len;
+    buf.get_unchecked(start..end)
+}
+
+/// Evaluates to the value inside a `Some`; on `None`, makes the enclosing
+/// function return `None`.
+macro_rules! otry {
+    ($x:expr) => {
+        match $x {
+            Some(value) => value,
+            None => return None,
+        }
+    };
+}
+
+/// Describes the UTF-8 codepoint containing the byte at index `idx` within
+/// `buf`.
+///
+/// When `buf[idx]` is a continuation byte, up to 3 preceding bytes are
+/// scanned for the start of its sequence; `rewind` in the result says how far
+/// back the codepoint begins. A sequence cut off by the end of the buffer is
+/// reported as `Meaning::Prefix`, and continuation bytes that run back to the
+/// start of the buffer without a start byte as `Meaning::Suffix`.
+///
+/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
+/// in the vicinity of `idx`. That includes a continuation byte at `idx` that
+/// lies beyond the end of the sequence begun by the nearest preceding start
+/// byte.
+#[inline]
+pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
+    if idx >= buf.len() {
+        return None;
+    }
+
+    // SAFETY: `idx < buf.len()` was checked above and `start` only moves down
+    // from `idx`, never below 0, so every `get_unchecked` index is in bounds.
+    // Each `unsafe_slice` call covers a range shown to fit by the surrounding
+    // `avail >= n` / `rewind < n` checks, and `decode` only receives slices
+    // whose length `n` came from `Byte::Start`, i.e. 2, 3 or 4.
+    unsafe {
+        let x = *buf.get_unchecked(idx);
+        match otry!(Byte::classify(x)) {
+            Byte::Ascii => Some(Codepoint {
+                bytes: unsafe_slice(buf, idx, 1),
+                rewind: 0,
+                meaning: Meaning::Whole(x as char),
+            }),
+            Byte::Start(n) => {
+                let avail = buf.len() - idx;
+                if avail >= n {
+                    let bytes = unsafe_slice(buf, idx, n);
+                    if !all_cont(unsafe_slice(bytes, 1, n - 1)) {
+                        return None;
+                    }
+                    let meaning = otry!(decode(bytes));
+                    Some(Codepoint {
+                        bytes: bytes,
+                        rewind: 0,
+                        meaning: meaning,
+                    })
+                } else {
+                    // The buffer ends before the sequence does.
+                    Some(Codepoint {
+                        bytes: unsafe_slice(buf, idx, avail),
+                        rewind: 0,
+                        meaning: Meaning::Prefix(n - avail),
+                    })
+                }
+            },
+            Byte::Cont => {
+                // Walk backwards looking for the start byte of the sequence
+                // that `idx` belongs to.
+                let mut start = idx;
+                loop {
+                    if start == 0 {
+                        // Whoops, fell off the beginning.
+                        return Some(Codepoint {
+                            bytes: unsafe_slice(buf, 0, idx + 1),
+                            rewind: idx,
+                            meaning: Meaning::Suffix,
+                        });
+                    }
+
+                    start -= 1;
+                    // Bytes `start + 1 ..= idx` are all known continuation bytes.
+                    let rewind = idx - start;
+                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
+                        Byte::Cont => (),
+                        Byte::Start(n) => {
+                            if rewind >= n {
+                                // The sequence begun at `start` ends before
+                                // `idx`, so the byte at `idx` is a stray
+                                // continuation byte, not part of a codepoint.
+                                return None;
+                            }
+
+                            let avail = buf.len() - start;
+                            if avail >= n {
+                                let bytes = unsafe_slice(buf, start, n);
+                                // Only the bytes from `idx` onward still need
+                                // checking; the earlier ones were seen above.
+                                if !all_cont(unsafe_slice(bytes, rewind, n - rewind)) {
+                                    return None;
+                                }
+                                let meaning = otry!(decode(bytes));
+                                return Some(Codepoint {
+                                    bytes: bytes,
+                                    rewind: rewind,
+                                    meaning: meaning,
+                                });
+                            } else {
+                                // The buffer ends before the sequence does.
+                                return Some(Codepoint {
+                                    bytes: unsafe_slice(buf, start, avail),
+                                    rewind: rewind,
+                                    meaning: Meaning::Prefix(n - avail),
+                                });
+                            }
+                        }
+                        // An ASCII byte cannot precede a continuation byte.
+                        _ => return None,
+                    }
+
+                    if rewind >= 3 {
+                        // We looked at 3 bytes before a continuation byte
+                        // and didn't find a start byte.
+                        return None;
+                    }
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test;