diff options
Diffstat (limited to 'third_party/rust/regex/src/input.rs')
-rw-r--r-- | third_party/rust/regex/src/input.rs | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/input.rs b/third_party/rust/regex/src/input.rs new file mode 100644 index 0000000000..df6c3e0c91 --- /dev/null +++ b/third_party/rust/regex/src/input.rs @@ -0,0 +1,432 @@ +use std::char; +use std::cmp::Ordering; +use std::fmt; +use std::ops; +use std::u32; + +use crate::literal::LiteralSearcher; +use crate::prog::InstEmptyLook; +use crate::utf8::{decode_last_utf8, decode_utf8}; + +/// Represents a location in the input. +#[derive(Clone, Copy, Debug)] +pub struct InputAt { + pos: usize, + c: Char, + byte: Option<u8>, + len: usize, +} + +impl InputAt { + /// Returns true iff this position is at the beginning of the input. + pub fn is_start(&self) -> bool { + self.pos == 0 + } + + /// Returns true iff this position is past the end of the input. + pub fn is_end(&self) -> bool { + self.c.is_none() && self.byte.is_none() + } + + /// Returns the character at this position. + /// + /// If this position is just before or after the input, then an absent + /// character is returned. + pub fn char(&self) -> Char { + self.c + } + + /// Returns the byte at this position. + pub fn byte(&self) -> Option<u8> { + self.byte + } + + /// Returns the UTF-8 width of the character at this position. + pub fn len(&self) -> usize { + self.len + } + + /// Returns whether the UTF-8 width of the character at this position + /// is zero. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the byte offset of this position. + pub fn pos(&self) -> usize { + self.pos + } + + /// Returns the byte offset of the next position in the input. + pub fn next_pos(&self) -> usize { + self.pos + self.len + } +} + +/// An abstraction over input used in the matching engines. +pub trait Input: fmt::Debug { + /// Return an encoding of the position at byte offset `i`. + fn at(&self, i: usize) -> InputAt; + + /// Return the Unicode character occurring next to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn next_char(&self, at: InputAt) -> Char; + + /// Return the Unicode character occurring previous to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn previous_char(&self, at: InputAt) -> Char; + + /// Return true if the given empty width instruction matches at the + /// input position given. + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + + /// Scan the input for a matching prefix. + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt>; + + /// The number of bytes in the input. + fn len(&self) -> usize; + + /// Whether the input is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the given input as a sequence of bytes. + fn as_bytes(&self) -> &[u8]; +} + +impl<'a, T: Input> Input for &'a T { + fn at(&self, i: usize) -> InputAt { + (**self).at(i) + } + + fn next_char(&self, at: InputAt) -> Char { + (**self).next_char(at) + } + + fn previous_char(&self, at: InputAt) -> Char { + (**self).previous_char(at) + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + (**self).is_empty_match(at, empty) + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt> { + (**self).prefix_at(prefixes, at) + } + + fn len(&self) -> usize { + (**self).len() + } + + fn as_bytes(&self) -> &[u8] { + (**self).as_bytes() + } +} + +/// An input reader over characters. +#[derive(Clone, Copy, Debug)] +pub struct CharInput<'t>(&'t [u8]); + +impl<'t> CharInput<'t> { + /// Return a new character input reader for the given string. + pub fn new(s: &'t [u8]) -> CharInput<'t> { + CharInput(s) + } +} + +impl<'t> ops::Deref for CharInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.0 + } +} + +impl<'t> Input for CharInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); + InputAt { pos: i, c, byte: None, len: c.len_utf8() } + } + } + + fn next_char(&self, at: InputAt) -> Char { + at.char() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt> { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn as_bytes(&self) -> &[u8] { + self.0 + } +} + +/// An input reader over bytes. +#[derive(Clone, Copy, Debug)] +pub struct ByteInput<'t> { + text: &'t [u8], + only_utf8: bool, +} + +impl<'t> ByteInput<'t> { + /// Return a new byte-based input reader for the given string. + pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { + ByteInput { text, only_utf8 } + } +} + +impl<'t> ops::Deref for ByteInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.text + } +} + +impl<'t> Input for ByteInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + InputAt { + pos: i, + c: None.into(), + byte: self.get(i).cloned(), + len: 1, + } + } + } + + fn next_char(&self, at: InputAt) -> Char { + decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt> { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.text.len() + } + + fn as_bytes(&self) -> &[u8] { + self.text + } +} + +/// An inline representation of `Option<char>`. +/// +/// This eliminates the need to do case analysis on `Option<char>` to determine +/// ordinality with other characters. +/// +/// (The `Option<char>` is not related to encoding. Instead, it is used in the +/// matching engines to represent the beginning and ending boundaries of the +/// search text.) +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Char(u32); + +impl fmt::Debug for Char { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match char::from_u32(self.0) { + None => write!(f, "Empty"), + Some(c) => write!(f, "{:?}", c), + } + } +} + +impl Char { + /// Returns true iff the character is absent. + #[inline] + pub fn is_none(self) -> bool { + self.0 == u32::MAX + } + + /// Returns the length of the character's UTF-8 encoding. + /// + /// If the character is absent, then `1` is returned. + #[inline] + pub fn len_utf8(self) -> usize { + char::from_u32(self.0).map_or(1, |c| c.len_utf8()) + } + + /// Returns true iff the character is a word character. + /// + /// If the character is absent, then false is returned. + pub fn is_word_char(self) -> bool { + // is_word_character can panic if the Unicode data for \w isn't + // available. However, our compiler ensures that if a Unicode word + // boundary is used, then the data must also be available. If it isn't, + // then the compiler returns an error. + char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) + } + + /// Returns true iff the byte is a word byte. + /// + /// If the byte is absent, then false is returned. + pub fn is_word_byte(self) -> bool { + match char::from_u32(self.0) { + Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), + None | Some(_) => false, + } + } +} + +impl From<char> for Char { + fn from(c: char) -> Char { + Char(c as u32) + } +} + +impl From<Option<char>> for Char { + fn from(c: Option<char>) -> Char { + c.map_or(Char(u32::MAX), |c| c.into()) + } +} + +impl PartialEq<char> for Char { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other as u32 + } +} + +impl PartialEq<Char> for char { + #[inline] + fn eq(&self, other: &Char) -> bool { + *self as u32 == other.0 + } +} + +impl PartialOrd<char> for Char { + #[inline] + fn partial_cmp(&self, other: &char) -> Option<Ordering> { + self.0.partial_cmp(&(*other as u32)) + } +} + +impl PartialOrd<Char> for char { + #[inline] + fn partial_cmp(&self, other: &Char) -> Option<Ordering> { + (*self as u32).partial_cmp(&other.0) + } +} |