summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/input.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/regex/src/input.rs
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex/src/input.rs')
-rw-r--r--third_party/rust/regex/src/input.rs432
1 files changed, 432 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/input.rs b/third_party/rust/regex/src/input.rs
new file mode 100644
index 0000000000..df6c3e0c91
--- /dev/null
+++ b/third_party/rust/regex/src/input.rs
@@ -0,0 +1,432 @@
+use std::char;
+use std::cmp::Ordering;
+use std::fmt;
+use std::ops;
+use std::u32;
+
+use crate::literal::LiteralSearcher;
+use crate::prog::InstEmptyLook;
+use crate::utf8::{decode_last_utf8, decode_utf8};
+
+/// Represents a location in the input.
+#[derive(Clone, Copy, Debug)]
+pub struct InputAt {
+ pos: usize,
+ c: Char,
+ byte: Option<u8>,
+ len: usize,
+}
+
+impl InputAt {
+ /// Returns true iff this position is at the beginning of the input.
+ pub fn is_start(&self) -> bool {
+ self.pos == 0
+ }
+
+ /// Returns true iff this position is past the end of the input.
+ pub fn is_end(&self) -> bool {
+ self.c.is_none() && self.byte.is_none()
+ }
+
+ /// Returns the character at this position.
+ ///
+ /// If this position is just before or after the input, then an absent
+ /// character is returned.
+ pub fn char(&self) -> Char {
+ self.c
+ }
+
+ /// Returns the byte at this position.
+ pub fn byte(&self) -> Option<u8> {
+ self.byte
+ }
+
+ /// Returns the UTF-8 width of the character at this position.
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns whether the UTF-8 width of the character at this position
+ /// is zero.
+ pub fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ /// Returns the byte offset of this position.
+ pub fn pos(&self) -> usize {
+ self.pos
+ }
+
+ /// Returns the byte offset of the next position in the input.
+ pub fn next_pos(&self) -> usize {
+ self.pos + self.len
+ }
+}
+
+/// An abstraction over input used in the matching engines.
+pub trait Input: fmt::Debug {
+ /// Return an encoding of the position at byte offset `i`.
+ fn at(&self, i: usize) -> InputAt;
+
+ /// Return the Unicode character occurring next to `at`.
+ ///
+ /// If no such character could be decoded, then `Char` is absent.
+ fn next_char(&self, at: InputAt) -> Char;
+
+ /// Return the Unicode character occurring previous to `at`.
+ ///
+ /// If no such character could be decoded, then `Char` is absent.
+ fn previous_char(&self, at: InputAt) -> Char;
+
+ /// Return true if the given empty width instruction matches at the
+ /// input position given.
+ fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
+
+ /// Scan the input for a matching prefix.
+ fn prefix_at(
+ &self,
+ prefixes: &LiteralSearcher,
+ at: InputAt,
+ ) -> Option<InputAt>;
+
+ /// The number of bytes in the input.
+ fn len(&self) -> usize;
+
+ /// Whether the input is empty.
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Return the given input as a sequence of bytes.
+ fn as_bytes(&self) -> &[u8];
+}
+
+impl<'a, T: Input> Input for &'a T {
+ fn at(&self, i: usize) -> InputAt {
+ (**self).at(i)
+ }
+
+ fn next_char(&self, at: InputAt) -> Char {
+ (**self).next_char(at)
+ }
+
+ fn previous_char(&self, at: InputAt) -> Char {
+ (**self).previous_char(at)
+ }
+
+ fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+ (**self).is_empty_match(at, empty)
+ }
+
+ fn prefix_at(
+ &self,
+ prefixes: &LiteralSearcher,
+ at: InputAt,
+ ) -> Option<InputAt> {
+ (**self).prefix_at(prefixes, at)
+ }
+
+ fn len(&self) -> usize {
+ (**self).len()
+ }
+
+ fn as_bytes(&self) -> &[u8] {
+ (**self).as_bytes()
+ }
+}
+
+/// An input reader over characters.
+#[derive(Clone, Copy, Debug)]
+pub struct CharInput<'t>(&'t [u8]);
+
+impl<'t> CharInput<'t> {
+ /// Return a new character input reader for the given string.
+ pub fn new(s: &'t [u8]) -> CharInput<'t> {
+ CharInput(s)
+ }
+}
+
+impl<'t> ops::Deref for CharInput<'t> {
+ type Target = [u8];
+
+ fn deref(&self) -> &[u8] {
+ self.0
+ }
+}
+
+impl<'t> Input for CharInput<'t> {
+ fn at(&self, i: usize) -> InputAt {
+ if i >= self.len() {
+ InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
+ } else {
+ let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
+ InputAt { pos: i, c, byte: None, len: c.len_utf8() }
+ }
+ }
+
+ fn next_char(&self, at: InputAt) -> Char {
+ at.char()
+ }
+
+ fn previous_char(&self, at: InputAt) -> Char {
+ decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
+ }
+
+ fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+ use crate::prog::EmptyLook::*;
+ match empty.look {
+ StartLine => {
+ let c = self.previous_char(at);
+ at.pos() == 0 || c == '\n'
+ }
+ EndLine => {
+ let c = self.next_char(at);
+ at.pos() == self.len() || c == '\n'
+ }
+ StartText => at.pos() == 0,
+ EndText => at.pos() == self.len(),
+ WordBoundary => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ c1.is_word_char() != c2.is_word_char()
+ }
+ NotWordBoundary => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ c1.is_word_char() == c2.is_word_char()
+ }
+ WordBoundaryAscii => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ c1.is_word_byte() != c2.is_word_byte()
+ }
+ NotWordBoundaryAscii => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ c1.is_word_byte() == c2.is_word_byte()
+ }
+ }
+ }
+
+ fn prefix_at(
+ &self,
+ prefixes: &LiteralSearcher,
+ at: InputAt,
+ ) -> Option<InputAt> {
+ prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
+ }
+
+ fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ fn as_bytes(&self) -> &[u8] {
+ self.0
+ }
+}
+
+/// An input reader over bytes.
+#[derive(Clone, Copy, Debug)]
+pub struct ByteInput<'t> {
+ text: &'t [u8],
+ only_utf8: bool,
+}
+
+impl<'t> ByteInput<'t> {
+ /// Return a new byte-based input reader for the given string.
+ pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
+ ByteInput { text, only_utf8 }
+ }
+}
+
+impl<'t> ops::Deref for ByteInput<'t> {
+ type Target = [u8];
+
+ fn deref(&self) -> &[u8] {
+ self.text
+ }
+}
+
+impl<'t> Input for ByteInput<'t> {
+ fn at(&self, i: usize) -> InputAt {
+ if i >= self.len() {
+ InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
+ } else {
+ InputAt {
+ pos: i,
+ c: None.into(),
+ byte: self.get(i).cloned(),
+ len: 1,
+ }
+ }
+ }
+
+ fn next_char(&self, at: InputAt) -> Char {
+ decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
+ }
+
+ fn previous_char(&self, at: InputAt) -> Char {
+ decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
+ }
+
+ fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+ use crate::prog::EmptyLook::*;
+ match empty.look {
+ StartLine => {
+ let c = self.previous_char(at);
+ at.pos() == 0 || c == '\n'
+ }
+ EndLine => {
+ let c = self.next_char(at);
+ at.pos() == self.len() || c == '\n'
+ }
+ StartText => at.pos() == 0,
+ EndText => at.pos() == self.len(),
+ WordBoundary => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ c1.is_word_char() != c2.is_word_char()
+ }
+ NotWordBoundary => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ c1.is_word_char() == c2.is_word_char()
+ }
+ WordBoundaryAscii => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ if self.only_utf8 {
+ // If we must match UTF-8, then we can't match word
+ // boundaries at invalid UTF-8.
+ if c1.is_none() && !at.is_start() {
+ return false;
+ }
+ if c2.is_none() && !at.is_end() {
+ return false;
+ }
+ }
+ c1.is_word_byte() != c2.is_word_byte()
+ }
+ NotWordBoundaryAscii => {
+ let (c1, c2) = (self.previous_char(at), self.next_char(at));
+ if self.only_utf8 {
+ // If we must match UTF-8, then we can't match word
+ // boundaries at invalid UTF-8.
+ if c1.is_none() && !at.is_start() {
+ return false;
+ }
+ if c2.is_none() && !at.is_end() {
+ return false;
+ }
+ }
+ c1.is_word_byte() == c2.is_word_byte()
+ }
+ }
+ }
+
+ fn prefix_at(
+ &self,
+ prefixes: &LiteralSearcher,
+ at: InputAt,
+ ) -> Option<InputAt> {
+ prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
+ }
+
+ fn len(&self) -> usize {
+ self.text.len()
+ }
+
+ fn as_bytes(&self) -> &[u8] {
+ self.text
+ }
+}
+
+/// An inline representation of `Option<char>`.
+///
+/// This eliminates the need to do case analysis on `Option<char>` to determine
+/// ordinality with other characters.
+///
+/// (The `Option<char>` is not related to encoding. Instead, it is used in the
+/// matching engines to represent the beginning and ending boundaries of the
+/// search text.)
+#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Char(u32);
+
+impl fmt::Debug for Char {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match char::from_u32(self.0) {
+ None => write!(f, "Empty"),
+ Some(c) => write!(f, "{:?}", c),
+ }
+ }
+}
+
+impl Char {
+ /// Returns true iff the character is absent.
+ #[inline]
+ pub fn is_none(self) -> bool {
+ self.0 == u32::MAX
+ }
+
+ /// Returns the length of the character's UTF-8 encoding.
+ ///
+ /// If the character is absent, then `1` is returned.
+ #[inline]
+ pub fn len_utf8(self) -> usize {
+ char::from_u32(self.0).map_or(1, |c| c.len_utf8())
+ }
+
+ /// Returns true iff the character is a word character.
+ ///
+ /// If the character is absent, then false is returned.
+ pub fn is_word_char(self) -> bool {
+ // is_word_character can panic if the Unicode data for \w isn't
+ // available. However, our compiler ensures that if a Unicode word
+ // boundary is used, then the data must also be available. If it isn't,
+ // then the compiler returns an error.
+ char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
+ }
+
+ /// Returns true iff the byte is a word byte.
+ ///
+ /// If the byte is absent, then false is returned.
+ pub fn is_word_byte(self) -> bool {
+ match char::from_u32(self.0) {
+ Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
+ None | Some(_) => false,
+ }
+ }
+}
+
+impl From<char> for Char {
+ fn from(c: char) -> Char {
+ Char(c as u32)
+ }
+}
+
+impl From<Option<char>> for Char {
+ fn from(c: Option<char>) -> Char {
+ c.map_or(Char(u32::MAX), |c| c.into())
+ }
+}
+
+impl PartialEq<char> for Char {
+ #[inline]
+ fn eq(&self, other: &char) -> bool {
+ self.0 == *other as u32
+ }
+}
+
+impl PartialEq<Char> for char {
+ #[inline]
+ fn eq(&self, other: &Char) -> bool {
+ *self as u32 == other.0
+ }
+}
+
+impl PartialOrd<char> for Char {
+ #[inline]
+ fn partial_cmp(&self, other: &char) -> Option<Ordering> {
+ self.0.partial_cmp(&(*other as u32))
+ }
+}
+
+impl PartialOrd<Char> for char {
+ #[inline]
+ fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
+ (*self as u32).partial_cmp(&other.0)
+ }
+}