summary | refs | log | tree | commit | diff | stats
path: root/vendor/utf8parse/src
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/utf8parse/src')
-rw-r--r-- vendor/utf8parse/src/lib.rs 132
-rw-r--r-- vendor/utf8parse/src/types.rs 105
2 files changed, 237 insertions, 0 deletions
diff --git a/vendor/utf8parse/src/lib.rs b/vendor/utf8parse/src/lib.rs
new file mode 100644
index 000000000..093de81e4
--- /dev/null
+++ b/vendor/utf8parse/src/lib.rs
@@ -0,0 +1,132 @@
+//! A table-driven UTF-8 Parser
+//!
+//! This module implements a table-driven UTF-8 parser which should
+//! theoretically contain the minimal number of branches (1). The only branch is
+//! on the `Action` returned from unpacking a transition.
+#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)]
+#![cfg_attr(all(feature = "nightly", test), feature(test))]
+#![no_std]
+
+use core::char;
+
+mod types;
+
+use types::{Action, State};
+
+/// Sink for the events produced by [`Parser::advance`].
+///
+/// An implementor is notified once for every decoded character and once for every byte
+/// sequence the parser rejects.
+pub trait Receiver {
+    /// Invoked with each character that was decoded successfully.
+    fn codepoint(&mut self, _: char);
+
+    /// Invoked when the bytes seen so far cannot form a valid UTF-8 sequence.
+    fn invalid_sequence(&mut self);
+}
+
+/// Incremental, byte-at-a-time UTF-8 decoder.
+///
+/// Feed the input through `advance` one byte at a time; decoded characters and errors are
+/// reported to a [`Receiver`].
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct Parser {
+    // Bits of the code point assembled so far from the current multi-byte sequence.
+    point: u32,
+    // Position of the state machine within the current sequence.
+    state: State,
+}
+
+/// Keeps the low six bits of a byte: the payload of a UTF-8 continuation byte (`10xxxxxx`).
+const CONTINUATION_MASK: u8 = 0b0011_1111;
+
+impl Parser {
+    /// Construct a parser in the ground state with no partially decoded code point.
+    pub fn new() -> Parser {
+        Parser { state: State::Ground, point: 0 }
+    }
+
+    /// Feed one byte to the parser.
+    ///
+    /// The provided receiver is notified whenever this byte completes a codepoint or makes the
+    /// current sequence invalid.
+    pub fn advance<R: Receiver>(&mut self, receiver: &mut R, byte: u8) {
+        let (next, action) = self.state.advance(byte);
+        self.perform_action(receiver, byte, action);
+        self.state = next;
+    }
+
+    /// Apply `action` for `byte`: either accumulate its payload bits into `self.point` or
+    /// notify `receiver` of a finished codepoint / an invalid sequence.
+    fn perform_action<R: Receiver>(&mut self, receiver: &mut R, byte: u8, action: Action) {
+        // Payload bits of `byte` selected by `mask`, moved to their place in the code point.
+        let bits = |mask: u8, shift: u32| u32::from(byte & mask) << shift;
+
+        match action {
+            Action::InvalidSequence => {
+                // Discard whatever was assembled so far before reporting the error.
+                self.point = 0;
+                receiver.invalid_sequence();
+            },
+            Action::EmitByte => receiver.codepoint(byte as char),
+            Action::SetByte1 => {
+                let scalar = self.point | bits(CONTINUATION_MASK, 0);
+                self.point = 0;
+
+                // SAFETY: `State::advance` only yields `SetByte1` for the final continuation
+                // byte of a sequence whose lead and second bytes it has already range-checked
+                // (rejecting overlong forms, the surrogate range after `0xED` and values above
+                // U+10FFFF after `0xF4`), and `point` is zeroed after every emit and every
+                // error, so `scalar` is a valid Unicode scalar value.
+                let decoded = unsafe { char::from_u32_unchecked(scalar) };
+                receiver.codepoint(decoded);
+            },
+            // Continuation bytes contribute six bits each.
+            Action::SetByte2 => self.point |= bits(CONTINUATION_MASK, 6),
+            Action::SetByte3 => self.point |= bits(CONTINUATION_MASK, 12),
+            // Lead bytes contribute five, four or three bits depending on sequence length.
+            Action::SetByte2Top => self.point |= bits(0b0001_1111, 6),
+            Action::SetByte3Top => self.point |= bits(0b0000_1111, 12),
+            Action::SetByte4 => self.point |= bits(0b0000_0111, 18),
+        }
+    }
+}
+
+#[cfg(all(feature = "nightly", test))]
+mod benches {
+    extern crate std;
+    extern crate test;
+
+    use self::test::{black_box, Bencher};
+    use super::{Parser, Receiver};
+
+    // Input shared by both benchmarks below.
+    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");
+
+    // Receiver that does nothing except pass each codepoint through `black_box`.
+    impl Receiver for () {
+        fn codepoint(&mut self, c: char) {
+            black_box(c);
+        }
+
+        fn invalid_sequence(&mut self) {}
+    }
+
+    /// Runs the demo file through `Parser`, one byte per `advance` call.
+    #[bench]
+    fn parse_bench_utf8_demo(b: &mut Bencher) {
+        let mut parser = Parser::new();
+
+        b.iter(|| {
+            for &byte in UTF8_DEMO {
+                parser.advance(&mut (), byte);
+            }
+        })
+    }
+
+    /// Comparison point: validate the same input with `std::str::from_utf8` and walk its chars.
+    #[bench]
+    fn std_string_parse_utf8(b: &mut Bencher) {
+        b.iter(|| {
+            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
+            for c in text.chars() {
+                black_box(c);
+            }
+        });
+    }
+}
diff --git a/vendor/utf8parse/src/types.rs b/vendor/utf8parse/src/types.rs
new file mode 100644
index 000000000..f57a94d6f
--- /dev/null
+++ b/vendor/utf8parse/src/types.rs
@@ -0,0 +1,105 @@
+//! Types supporting the UTF-8 parser
+
+/// What the parser should do with the byte it has just been given.
+#[derive(Clone, Copy, Debug)]
+pub enum Action {
+    /// The byte is not allowed at this point; the sequence is invalid.
+    InvalidSequence = 0,
+    /// The byte is 7-bit ASCII and can be emitted as-is.
+    EmitByte = 1,
+    /// Store the last continuation byte of a sequence (bits 0-5), completing the code point.
+    SetByte1 = 2,
+    /// Store the second-from-last byte when it is a continuation byte (bits 6-11).
+    SetByte2 = 3,
+    /// Store the second-from-last byte when it is the lead byte of a two byte sequence.
+    SetByte2Top = 4,
+    /// Store the third-from-last byte when it is a continuation byte (bits 12-17).
+    SetByte3 = 5,
+    /// Store the third-from-last byte when it is the lead byte of a three byte sequence.
+    SetByte3Top = 6,
+    /// Store the lead byte of a four byte sequence.
+    SetByte4 = 7,
+}
+
+/// Position of the parser within a UTF-8 sequence.
+///
+/// Besides the generic "n tail bytes left" states, the lead bytes `E0`, `ED`, `F0` and `F4`
+/// each get a dedicated state, because the byte that follows them is restricted to a narrower
+/// range than an ordinary tail byte.
+// Variant names carry the lead byte as a lowercase hex suffix, hence the lint exemption.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum State {
+    /// Between sequences; an ASCII byte or any valid lead byte may follow.
+    Ground = 0,
+    /// Three tail bytes are still expected.
+    Tail3 = 1,
+    /// Two tail bytes are still expected.
+    Tail2 = 2,
+    /// One tail byte is still expected.
+    Tail1 = 3,
+    /// Lead byte `E0` was seen; the next byte must be in `A0..=BF`.
+    U3_2_e0 = 4,
+    /// Lead byte `ED` was seen; the next byte must be in `80..=9F`.
+    U3_2_ed = 5,
+    /// Lead byte `F0` was seen; the next byte must be in `90..=BF`.
+    Utf8_4_3_f0 = 6,
+    /// Lead byte `F4` was seen; the next byte must be in `80..=8F`.
+    Utf8_4_3_f4 = 7,
+}
+
+// The state machine starts out between sequences, i.e. in the ground state.
+impl Default for State {
+    fn default() -> Self {
+        State::Ground
+    }
+}
+
+impl State {
+    /// Compute the transition taken from `self` on input `byte`.
+    ///
+    /// Returns the state to move to together with the action the parser must perform for
+    /// `byte`. A byte that is not acceptable in the current state yields
+    /// `(State::Ground, Action::InvalidSequence)`.
+    #[inline]
+    pub fn advance(self, byte: u8) -> (State, Action) {
+        match (self, byte) {
+            // Ground: ASCII is emitted directly, a lead byte opens a multi-byte sequence.
+            (State::Ground, 0x00..=0x7f) => (State::Ground, Action::EmitByte),
+            (State::Ground, 0xc2..=0xdf) => (State::Tail1, Action::SetByte2Top),
+            (State::Ground, 0xe0) => (State::U3_2_e0, Action::SetByte3Top),
+            (State::Ground, 0xe1..=0xec) | (State::Ground, 0xee..=0xef) => {
+                (State::Tail2, Action::SetByte3Top)
+            },
+            (State::Ground, 0xed) => (State::U3_2_ed, Action::SetByte3Top),
+            (State::Ground, 0xf0) => (State::Utf8_4_3_f0, Action::SetByte4),
+            (State::Ground, 0xf1..=0xf3) => (State::Tail3, Action::SetByte4),
+            (State::Ground, 0xf4) => (State::Utf8_4_3_f4, Action::SetByte4),
+
+            // Second byte of a three byte sequence with a restricted range.
+            (State::U3_2_e0, 0xa0..=0xbf) => (State::Tail1, Action::SetByte2),
+            (State::U3_2_ed, 0x80..=0x9f) => (State::Tail1, Action::SetByte2),
+
+            // Second byte of a four byte sequence with a restricted range.
+            (State::Utf8_4_3_f0, 0x90..=0xbf) => (State::Tail2, Action::SetByte3),
+            (State::Utf8_4_3_f4, 0x80..=0x8f) => (State::Tail2, Action::SetByte3),
+
+            // Ordinary tail bytes.
+            (State::Tail3, 0x80..=0xbf) => (State::Tail2, Action::SetByte3),
+            (State::Tail2, 0x80..=0xbf) => (State::Tail1, Action::SetByte2),
+            (State::Tail1, 0x80..=0xbf) => (State::Ground, Action::SetByte1),
+
+            // Anything else is rejected and sends the machine back to the ground state.
+            _ => (State::Ground, Action::InvalidSequence),
+        }
+    }
+}