Adding upstream version 115.7.0esr.upstream/115.7.0esr

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit: 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree: 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/regex/src/compile.rs
parent: Initial commit. (diff)
download: firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
1 files changed, 1264 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/compile.rs b/third_party/rust/regex/src/compile.rs
new file mode 100644
index 0000000000..90ca25015f
--- /dev/null
+++ b/third_party/rust/regex/src/compile.rs
@@ -0,0 +1,1264 @@
+use std::collections::HashMap;
+use std::fmt;
+use std::iter;
+use std::result;
+use std::sync::Arc;
+
+use regex_syntax::hir::{self, Hir};
+use regex_syntax::is_word_byte;
+use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
+
+use crate::prog::{
+    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
+    InstSave, InstSplit, Program,
+};
+
+use crate::Error;
+
+type Result = result::Result<Patch, Error>;
+type ResultOrEmpty = result::Result<Option<Patch>, Error>;
+
+#[derive(Debug)]
+struct Patch {
+    hole: Hole,
+    entry: InstPtr,
+}
+
+/// A compiler translates a regular expression AST to a sequence of
+/// instructions. The sequence of instructions represents an NFA.
+// `Compiler` is only public via the `internal` module, so avoid deriving
+// `Debug`.
+#[allow(missing_debug_implementations)]
+pub struct Compiler {
+    insts: Vec<MaybeInst>,
+    compiled: Program,
+    capture_name_idx: HashMap<String, usize>,
+    num_exprs: usize,
+    size_limit: usize,
+    suffix_cache: SuffixCache,
+    utf8_seqs: Option<Utf8Sequences>,
+    byte_classes: ByteClassSet,
+    // This keeps track of extra bytes allocated while compiling the regex
+    // program. Currently, this corresponds to two things. First is the heap
+    // memory allocated by Unicode character classes ('InstRanges'). Second is
+    // a "fake" amount of memory used by empty sub-expressions, so that enough
+    // empty sub-expressions will ultimately trigger the compiler to bail
+    // because of a size limit restriction. (That empty sub-expressions don't
+    // add to heap memory usage is more-or-less an implementation detail.) In
+    // the second case, if we don't bail, then an excessively large repetition
+    // on an empty sub-expression can result in the compiler using a very large
+    // amount of CPU time.
+    extra_inst_bytes: usize,
+}
+
+impl Compiler {
+    /// Create a new regular expression compiler.
+    ///
+    /// Various options can be set before calling `compile` on an expression.
+    pub fn new() -> Self {
+        Compiler {
+            insts: vec![],
+            compiled: Program::new(),
+            capture_name_idx: HashMap::new(),
+            num_exprs: 0,
+            size_limit: 10 * (1 << 20),
+            suffix_cache: SuffixCache::new(1000),
+            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
+            byte_classes: ByteClassSet::new(),
+            extra_inst_bytes: 0,
+        }
+    }
+
+    /// The size of the resulting program is limited by size_limit. If
+    /// the program approximately exceeds the given size (in bytes), then
+    /// compilation will stop and return an error.
+    pub fn size_limit(mut self, size_limit: usize) -> Self {
+        self.size_limit = size_limit;
+        self
+    }
+
+    /// If bytes is true, then the program is compiled as a byte based
+    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
+    /// false, then the automaton is Unicode scalar value based, e.g., an
+    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
+    ///
+    /// The specific invariant is that when returning a byte based machine,
+    /// the neither the `Char` nor `Ranges` instructions are produced.
+    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
+    /// instruction is never produced.
+    ///
+    /// Note that `dfa(true)` implies `bytes(true)`.
+    pub fn bytes(mut self, yes: bool) -> Self {
+        self.compiled.is_bytes = yes;
+        self
+    }
+
+    /// When disabled, the program compiled may match arbitrary bytes.
+    ///
+    /// When enabled (the default), all compiled programs exclusively match
+    /// valid UTF-8 bytes.
+    pub fn only_utf8(mut self, yes: bool) -> Self {
+        self.compiled.only_utf8 = yes;
+        self
+    }
+
+    /// When set, the machine returned is suitable for use in the DFA matching
+    /// engine.
+    ///
+    /// In particular, this ensures that if the regex is not anchored in the
+    /// beginning, then a preceding `.*?` is included in the program. (The NFA
+    /// based engines handle the preceding `.*?` explicitly, which is difficult
+    /// or impossible in the DFA engine.)
+    pub fn dfa(mut self, yes: bool) -> Self {
+        self.compiled.is_dfa = yes;
+        self
+    }
+
+    /// When set, the machine returned is suitable for matching text in
+    /// reverse. In particular, all concatenations are flipped.
+    pub fn reverse(mut self, yes: bool) -> Self {
+        self.compiled.is_reverse = yes;
+        self
+    }
+
+    /// Compile a regular expression given its AST.
+    ///
+    /// The compiler is guaranteed to succeed unless the program exceeds the
+    /// specified size limit. If the size limit is exceeded, then compilation
+    /// stops and returns an error.
+    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
+        debug_assert!(!exprs.is_empty());
+        self.num_exprs = exprs.len();
+        if exprs.len() == 1 {
+            self.compile_one(&exprs[0])
+        } else {
+            self.compile_many(exprs)
+        }
+    }
+
+    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+        // If we're compiling a forward DFA and we aren't anchored, then
+        // add a `.*?` before the first capture group.
+        // Other matching engines handle this by baking the logic into the
+        // matching engine itself.
+        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
+        self.compiled.is_anchored_start = expr.is_anchored_start();
+        self.compiled.is_anchored_end = expr.is_anchored_end();
+        if self.compiled.needs_dotstar() {
+            dotstar_patch = self.c_dotstar()?;
+            self.compiled.start = dotstar_patch.entry;
+        }
+        self.compiled.captures = vec![None];
+        let patch =
+            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
+        if self.compiled.needs_dotstar() {
+            self.fill(dotstar_patch.hole, patch.entry);
+        } else {
+            self.compiled.start = patch.entry;
+        }
+        self.fill_to_next(patch.hole);
+        self.compiled.matches = vec![self.insts.len()];
+        self.push_compiled(Inst::Match(0));
+        self.compile_finish()
+    }
+
+    fn compile_many(
+        mut self,
+        exprs: &[Hir],
+    ) -> result::Result<Program, Error> {
+        debug_assert!(exprs.len() > 1);
+
+        self.compiled.is_anchored_start =
+            exprs.iter().all(|e| e.is_anchored_start());
+        self.compiled.is_anchored_end =
+            exprs.iter().all(|e| e.is_anchored_end());
+        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
+        if self.compiled.needs_dotstar() {
+            dotstar_patch = self.c_dotstar()?;
+            self.compiled.start = dotstar_patch.entry;
+        } else {
+            self.compiled.start = 0; // first instruction is always split
+        }
+        self.fill_to_next(dotstar_patch.hole);
+
+        let mut prev_hole = Hole::None;
+        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
+            self.fill_to_next(prev_hole);
+            let split = self.push_split_hole();
+            let Patch { hole, entry } =
+                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
+            self.fill_to_next(hole);
+            self.compiled.matches.push(self.insts.len());
+            self.push_compiled(Inst::Match(i));
+            prev_hole = self.fill_split(split, Some(entry), None);
+        }
+        let i = exprs.len() - 1;
+        let Patch { hole, entry } =
+            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
+        self.fill(prev_hole, entry);
+        self.fill_to_next(hole);
+        self.compiled.matches.push(self.insts.len());
+        self.push_compiled(Inst::Match(i));
+        self.compile_finish()
+    }
+
+    fn compile_finish(mut self) -> result::Result<Program, Error> {
+        self.compiled.insts =
+            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
+        self.compiled.byte_classes = self.byte_classes.byte_classes();
+        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
+        Ok(self.compiled)
+    }
+
+    /// Compile expr into self.insts, returning a patch on success,
+    /// or an error if we run out of memory.
+    ///
+    /// All of the c_* methods of the compiler share the contract outlined
+    /// here.
+    ///
+    /// The main thing that a c_* method does is mutate `self.insts`
+    /// to add a list of mostly compiled instructions required to execute
+    /// the given expression. `self.insts` contains MaybeInsts rather than
+    /// Insts because there is some backpatching required.
+    ///
+    /// The `Patch` value returned by each c_* method provides metadata
+    /// about the compiled instructions emitted to `self.insts`. The
+    /// `entry` member of the patch refers to the first instruction
+    /// (the entry point), while the `hole` member contains zero or
+    /// more offsets to partial instructions that need to be backpatched.
+    /// The c_* routine can't know where its list of instructions are going to
+    /// jump to after execution, so it is up to the caller to patch
+    /// these jumps to point to the right place. So compiling some
+    /// expression, e, we would end up with a situation that looked like:
+    ///
+    /// ```text
+    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
+    ///                     ^              ^             ^
+    ///                     |                \         /
+    ///                   entry                \     /
+    ///                                         hole
+    /// ```
+    ///
+    /// To compile two expressions, e1 and e2, concatenated together we
+    /// would do:
+    ///
+    /// ```ignore
+    /// let patch1 = self.c(e1);
+    /// let patch2 = self.c(e2);
+    /// ```
+    ///
+    /// while leaves us with a situation that looks like
+    ///
+    /// ```text
+    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
+    ///                     ^        ^            ^        ^
+    ///                     |        |            |        |
+    ///                entry1        hole1   entry2        hole2
+    /// ```
+    ///
+    /// Then to merge the two patches together into one we would backpatch
+    /// hole1 with entry2 and return a new patch that enters at entry1
+    /// and has hole2 for a hole. In fact, if you look at the c_concat
+    /// method you will see that it does exactly this, though it handles
+    /// a list of expressions rather than just the two that we use for
+    /// an example.
+    ///
+    /// Ok(None) is returned when an expression is compiled to no
+    /// instruction, and so no patch.entry value makes sense.
+    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
+        use crate::prog;
+        use regex_syntax::hir::HirKind::*;
+
+        self.check_size()?;
+        match *expr.kind() {
+            Empty => self.c_empty(),
+            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
+            Literal(hir::Literal::Byte(b)) => {
+                assert!(self.compiled.uses_bytes());
+                self.c_byte(b)
+            }
+            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
+            Class(hir::Class::Bytes(ref cls)) => {
+                if self.compiled.uses_bytes() {
+                    self.c_class_bytes(cls.ranges())
+                } else {
+                    assert!(cls.is_all_ascii());
+                    let mut char_ranges = vec![];
+                    for r in cls.iter() {
+                        let (s, e) = (r.start() as char, r.end() as char);
+                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
+                    }
+                    self.c_class(&char_ranges)
+                }
+            }
+            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
+                self.byte_classes.set_range(b'\n', b'\n');
+                self.c_empty_look(prog::EmptyLook::EndLine)
+            }
+            Anchor(hir::Anchor::StartLine) => {
+                self.byte_classes.set_range(b'\n', b'\n');
+                self.c_empty_look(prog::EmptyLook::StartLine)
+            }
+            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
+                self.byte_classes.set_range(b'\n', b'\n');
+                self.c_empty_look(prog::EmptyLook::StartLine)
+            }
+            Anchor(hir::Anchor::EndLine) => {
+                self.byte_classes.set_range(b'\n', b'\n');
+                self.c_empty_look(prog::EmptyLook::EndLine)
+            }
+            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
+                self.c_empty_look(prog::EmptyLook::EndText)
+            }
+            Anchor(hir::Anchor::StartText) => {
+                self.c_empty_look(prog::EmptyLook::StartText)
+            }
+            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
+                self.c_empty_look(prog::EmptyLook::StartText)
+            }
+            Anchor(hir::Anchor::EndText) => {
+                self.c_empty_look(prog::EmptyLook::EndText)
+            }
+            WordBoundary(hir::WordBoundary::Unicode) => {
+                if !cfg!(feature = "unicode-perl") {
+                    return Err(Error::Syntax(
+                        "Unicode word boundaries are unavailable when \
+                         the unicode-perl feature is disabled"
+                            .to_string(),
+                    ));
+                }
+                self.compiled.has_unicode_word_boundary = true;
+                self.byte_classes.set_word_boundary();
+                // We also make sure that all ASCII bytes are in a different
+                // class from non-ASCII bytes. Otherwise, it's possible for
+                // ASCII bytes to get lumped into the same class as non-ASCII
+                // bytes. This in turn may cause the lazy DFA to falsely start
+                // when it sees an ASCII byte that maps to a byte class with
+                // non-ASCII bytes. This ensures that never happens.
+                self.byte_classes.set_range(0, 0x7F);
+                self.c_empty_look(prog::EmptyLook::WordBoundary)
+            }
+            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
+                if !cfg!(feature = "unicode-perl") {
+                    return Err(Error::Syntax(
+                        "Unicode word boundaries are unavailable when \
+                         the unicode-perl feature is disabled"
+                            .to_string(),
+                    ));
+                }
+                self.compiled.has_unicode_word_boundary = true;
+                self.byte_classes.set_word_boundary();
+                // See comments above for why we set the ASCII range here.
+                self.byte_classes.set_range(0, 0x7F);
+                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
+            }
+            WordBoundary(hir::WordBoundary::Ascii) => {
+                self.byte_classes.set_word_boundary();
+                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
+            }
+            WordBoundary(hir::WordBoundary::AsciiNegate) => {
+                self.byte_classes.set_word_boundary();
+                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
+            }
+            Group(ref g) => match g.kind {
+                hir::GroupKind::NonCapturing => self.c(&g.hir),
+                hir::GroupKind::CaptureIndex(index) => {
+                    if index as usize >= self.compiled.captures.len() {
+                        self.compiled.captures.push(None);
+                    }
+                    self.c_capture(2 * index as usize, &g.hir)
+                }
+                hir::GroupKind::CaptureName { index, ref name } => {
+                    if index as usize >= self.compiled.captures.len() {
+                        let n = name.to_string();
+                        self.compiled.captures.push(Some(n.clone()));
+                        self.capture_name_idx.insert(n, index as usize);
+                    }
+                    self.c_capture(2 * index as usize, &g.hir)
+                }
+            },
+            Concat(ref es) => {
+                if self.compiled.is_reverse {
+                    self.c_concat(es.iter().rev())
+                } else {
+                    self.c_concat(es)
+                }
+            }
+            Alternation(ref es) => self.c_alternate(&**es),
+            Repetition(ref rep) => self.c_repeat(rep),
+        }
+    }
+
+    fn c_empty(&mut self) -> ResultOrEmpty {
+        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+        // See: CVE-2022-24713
+        //
+        // Since 'empty' sub-expressions don't increase the size of
+        // the actual compiled object, we "fake" an increase in its
+        // size so that our 'check_size_limit' routine will eventually
+        // stop compilation if there are too many empty sub-expressions
+        // (e.g., via a large repetition).
+        self.extra_inst_bytes += std::mem::size_of::<Inst>();
+        Ok(None)
+    }
+
+    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
+        if self.num_exprs > 1 || self.compiled.is_dfa {
+            // Don't ever compile Save instructions for regex sets because
+            // they are never used. They are also never used in DFA programs
+            // because DFAs can't handle captures.
+            self.c(expr)
+        } else {
+            let entry = self.insts.len();
+            let hole = self.push_hole(InstHole::Save { slot: first_slot });
+            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
+            self.fill(hole, patch.entry);
+            self.fill_to_next(patch.hole);
+            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
+            Ok(Some(Patch { hole, entry }))
+        }
+    }
+
+    fn c_dotstar(&mut self) -> Result {
+        Ok(if !self.compiled.only_utf8() {
+            self.c(&Hir::repetition(hir::Repetition {
+                kind: hir::RepetitionKind::ZeroOrMore,
+                greedy: false,
+                hir: Box::new(Hir::any(true)),
+            }))?
+            .unwrap()
+        } else {
+            self.c(&Hir::repetition(hir::Repetition {
+                kind: hir::RepetitionKind::ZeroOrMore,
+                greedy: false,
+                hir: Box::new(Hir::any(false)),
+            }))?
+            .unwrap()
+        })
+    }
+
+    fn c_char(&mut self, c: char) -> ResultOrEmpty {
+        if self.compiled.uses_bytes() {
+            if c.is_ascii() {
+                let b = c as u8;
+                let hole =
+                    self.push_hole(InstHole::Bytes { start: b, end: b });
+                self.byte_classes.set_range(b, b);
+                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
+            } else {
+                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
+            }
+        } else {
+            let hole = self.push_hole(InstHole::Char { c });
+            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
+        }
+    }
+
+    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
+        use std::mem::size_of;
+
+        assert!(!ranges.is_empty());
+        if self.compiled.uses_bytes() {
+            Ok(Some(CompileClass { c: self, ranges }.compile()?))
+        } else {
+            let ranges: Vec<(char, char)> =
+                ranges.iter().map(|r| (r.start(), r.end())).collect();
+            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
+                self.push_hole(InstHole::Char { c: ranges[0].0 })
+            } else {
+                self.extra_inst_bytes +=
+                    ranges.len() * (size_of::<char>() * 2);
+                self.push_hole(InstHole::Ranges { ranges })
+            };
+            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
+        }
+    }
+
+    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
+        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
+    }
+
+    fn c_class_bytes(
+        &mut self,
+        ranges: &[hir::ClassBytesRange],
+    ) -> ResultOrEmpty {
+        debug_assert!(!ranges.is_empty());
+
+        let first_split_entry = self.insts.len();
+        let mut holes = vec![];
+        let mut prev_hole = Hole::None;
+        for r in &ranges[0..ranges.len() - 1] {
+            self.fill_to_next(prev_hole);
+            let split = self.push_split_hole();
+            let next = self.insts.len();
+            self.byte_classes.set_range(r.start(), r.end());
+            holes.push(self.push_hole(InstHole::Bytes {
+                start: r.start(),
+                end: r.end(),
+            }));
+            prev_hole = self.fill_split(split, Some(next), None);
+        }
+        let next = self.insts.len();
+        let r = &ranges[ranges.len() - 1];
+        self.byte_classes.set_range(r.start(), r.end());
+        holes.push(
+            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
+        );
+        self.fill(prev_hole, next);
+        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
+    }
+
+    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
+        let hole = self.push_hole(InstHole::EmptyLook { look });
+        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
+    }
+
+    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
+    where
+        I: IntoIterator<Item = &'a Hir>,
+    {
+        let mut exprs = exprs.into_iter();
+        let Patch { mut hole, entry } = loop {
+            match exprs.next() {
+                None => return self.c_empty(),
+                Some(e) => {
+                    if let Some(p) = self.c(e)? {
+                        break p;
+                    }
+                }
+            }
+        };
+        for e in exprs {
+            if let Some(p) = self.c(e)? {
+                self.fill(hole, p.entry);
+                hole = p.hole;
+            }
+        }
+        Ok(Some(Patch { hole, entry }))
+    }
+
+    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
+        debug_assert!(
+            exprs.len() >= 2,
+            "alternates must have at least 2 exprs"
+        );
+
+        // Initial entry point is always the first split.
+        let first_split_entry = self.insts.len();
+
+        // Save up all of the holes from each alternate. They will all get
+        // patched to point to the same location.
+        let mut holes = vec![];
+
+        // true indicates that the hole is a split where we want to fill
+        // the second branch.
+        let mut prev_hole = (Hole::None, false);
+        for e in &exprs[0..exprs.len() - 1] {
+            if prev_hole.1 {
+                let next = self.insts.len();
+                self.fill_split(prev_hole.0, None, Some(next));
+            } else {
+                self.fill_to_next(prev_hole.0);
+            }
+            let split = self.push_split_hole();
+            if let Some(Patch { hole, entry }) = self.c(e)? {
+                holes.push(hole);
+                prev_hole = (self.fill_split(split, Some(entry), None), false);
+            } else {
+                let (split1, split2) = split.dup_one();
+                holes.push(split1);
+                prev_hole = (split2, true);
+            }
+        }
+        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
+            holes.push(hole);
+            if prev_hole.1 {
+                self.fill_split(prev_hole.0, None, Some(entry));
+            } else {
+                self.fill(prev_hole.0, entry);
+            }
+        } else {
+            // We ignore prev_hole.1. When it's true, it means we have two
+            // empty branches both pushing prev_hole.0 into holes, so both
+            // branches will go to the same place anyway.
+            holes.push(prev_hole.0);
+        }
+        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
+    }
+
+    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
+        use regex_syntax::hir::RepetitionKind::*;
+        match rep.kind {
+            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
+            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
+            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
+            Range(hir::RepetitionRange::Exactly(min_max)) => {
+                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
+            }
+            Range(hir::RepetitionRange::AtLeast(min)) => {
+                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
+            }
+            Range(hir::RepetitionRange::Bounded(min, max)) => {
+                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
+            }
+        }
+    }
+
+    fn c_repeat_zero_or_one(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+    ) -> ResultOrEmpty {
+        let split_entry = self.insts.len();
+        let split = self.push_split_hole();
+        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
+            Some(p) => p,
+            None => return self.pop_split_hole(),
+        };
+        let split_hole = if greedy {
+            self.fill_split(split, Some(entry_rep), None)
+        } else {
+            self.fill_split(split, None, Some(entry_rep))
+        };
+        let holes = vec![hole_rep, split_hole];
+        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
+    }
+
+    fn c_repeat_zero_or_more(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+    ) -> ResultOrEmpty {
+        let split_entry = self.insts.len();
+        let split = self.push_split_hole();
+        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
+            Some(p) => p,
+            None => return self.pop_split_hole(),
+        };
+
+        self.fill(hole_rep, split_entry);
+        let split_hole = if greedy {
+            self.fill_split(split, Some(entry_rep), None)
+        } else {
+            self.fill_split(split, None, Some(entry_rep))
+        };
+        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
+    }
+
+    fn c_repeat_one_or_more(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+    ) -> ResultOrEmpty {
+        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
+            Some(p) => p,
+            None => return Ok(None),
+        };
+        self.fill_to_next(hole_rep);
+        let split = self.push_split_hole();
+
+        let split_hole = if greedy {
+            self.fill_split(split, Some(entry_rep), None)
+        } else {
+            self.fill_split(split, None, Some(entry_rep))
+        };
+        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
+    }
+
+    fn c_repeat_range_min_or_more(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+    ) -> ResultOrEmpty {
+        let min = u32_to_usize(min);
+        // Using next_inst() is ok, because we can't return it (concat would
+        // have to return Some(_) while c_repeat_range_min_or_more returns
+        // None).
+        let patch_concat = self
+            .c_concat(iter::repeat(expr).take(min))?
+            .unwrap_or_else(|| self.next_inst());
+        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
+            self.fill(patch_concat.hole, patch_rep.entry);
+            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn c_repeat_range(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+        max: u32,
+    ) -> ResultOrEmpty {
+        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
+        debug_assert!(min <= max);
+        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
+        if min == max {
+            return Ok(patch_concat);
+        }
+        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
+        // max at this point).
+        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
+        let initial_entry = patch_concat.entry;
+        // It is much simpler to compile, e.g., `a{2,5}` as:
+        //
+        //     aaa?a?a?
+        //
+        // But you end up with a sequence of instructions like this:
+        //
+        //     0: 'a'
+        //     1: 'a',
+        //     2: split(3, 4)
+        //     3: 'a'
+        //     4: split(5, 6)
+        //     5: 'a'
+        //     6: split(7, 8)
+        //     7: 'a'
+        //     8: MATCH
+        //
+        // This is *incredibly* inefficient because the splits end
+        // up forming a chain, which has to be resolved everything a
+        // transition is followed.
+        let mut holes = vec![];
+        let mut prev_hole = patch_concat.hole;
+        for _ in min..max {
+            self.fill_to_next(prev_hole);
+            let split = self.push_split_hole();
+            let Patch { hole, entry } = match self.c(expr)? {
+                Some(p) => p,
+                None => return self.pop_split_hole(),
+            };
+            prev_hole = hole;
+            if greedy {
+                holes.push(self.fill_split(split, Some(entry), None));
+            } else {
+                holes.push(self.fill_split(split, None, Some(entry)));
+            }
+        }
+        holes.push(prev_hole);
+        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
+    }
+
+    /// Can be used as a default value for the c_* functions when the call to
+    /// c_function is followed by inserting at least one instruction that is
+    /// always executed after the ones written by the c* function.
+    fn next_inst(&self) -> Patch {
+        Patch { hole: Hole::None, entry: self.insts.len() }
+    }
+
+    fn fill(&mut self, hole: Hole, goto: InstPtr) {
+        match hole {
+            Hole::None => {}
+            Hole::One(pc) => {
+                self.insts[pc].fill(goto);
+            }
+            Hole::Many(holes) => {
+                for hole in holes {
+                    self.fill(hole, goto);
+                }
+            }
+        }
+    }
+
+    fn fill_to_next(&mut self, hole: Hole) {
+        let next = self.insts.len();
+        self.fill(hole, next);
+    }
+
+    fn fill_split(
+        &mut self,
+        hole: Hole,
+        goto1: Option<InstPtr>,
+        goto2: Option<InstPtr>,
+    ) -> Hole {
+        match hole {
+            Hole::None => Hole::None,
+            Hole::One(pc) => match (goto1, goto2) {
+                (Some(goto1), Some(goto2)) => {
+                    self.insts[pc].fill_split(goto1, goto2);
+                    Hole::None
+                }
+                (Some(goto1), None) => {
+                    self.insts[pc].half_fill_split_goto1(goto1);
+                    Hole::One(pc)
+                }
+                (None, Some(goto2)) => {
+                    self.insts[pc].half_fill_split_goto2(goto2);
+                    Hole::One(pc)
+                }
+                (None, None) => unreachable!(
+                    "at least one of the split \
+                     holes must be filled"
+                ),
+            },
+            Hole::Many(holes) => {
+                let mut new_holes = vec![];
+                for hole in holes {
+                    new_holes.push(self.fill_split(hole, goto1, goto2));
+                }
+                if new_holes.is_empty() {
+                    Hole::None
+                } else if new_holes.len() == 1 {
+                    new_holes.pop().unwrap()
+                } else {
+                    Hole::Many(new_holes)
+                }
+            }
+        }
+    }
+
+    fn push_compiled(&mut self, inst: Inst) {
+        self.insts.push(MaybeInst::Compiled(inst));
+    }
+
+    fn push_hole(&mut self, inst: InstHole) -> Hole {
+        let hole = self.insts.len();
+        self.insts.push(MaybeInst::Uncompiled(inst));
+        Hole::One(hole)
+    }
+
+    fn push_split_hole(&mut self) -> Hole {
+        let hole = self.insts.len();
+        self.insts.push(MaybeInst::Split);
+        Hole::One(hole)
+    }
+
+    fn pop_split_hole(&mut self) -> ResultOrEmpty {
+        self.insts.pop();
+        Ok(None)
+    }
+
+    fn check_size(&self) -> result::Result<(), Error> {
+        use std::mem::size_of;
+
+        let size =
+            self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
+        if size > self.size_limit {
+            Err(Error::CompiledTooBig(self.size_limit))
+        } else {
+            Ok(())
+        }
+    }
+}
+
+#[derive(Debug)]
+enum Hole {
+    None,
+    One(InstPtr),
+    Many(Vec<Hole>),
+}
+
+impl Hole {
+    fn dup_one(self) -> (Self, Self) {
+        match self {
+            Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
+            Hole::None | Hole::Many(_) => {
+                unreachable!("must be called on single hole")
+            }
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+enum MaybeInst {
+    Compiled(Inst),
+    Uncompiled(InstHole),
+    Split,
+    Split1(InstPtr),
+    Split2(InstPtr),
+}
+
+impl MaybeInst {
+    fn fill(&mut self, goto: InstPtr) {
+        let maybeinst = match *self {
+            MaybeInst::Split => MaybeInst::Split1(goto),
+            MaybeInst::Uncompiled(ref inst) => {
+                MaybeInst::Compiled(inst.fill(goto))
+            }
+            MaybeInst::Split1(goto1) => {
+                MaybeInst::Compiled(Inst::Split(InstSplit {
+                    goto1,
+                    goto2: goto,
+                }))
+            }
+            MaybeInst::Split2(goto2) => {
+                MaybeInst::Compiled(Inst::Split(InstSplit {
+                    goto1: goto,
+                    goto2,
+                }))
+            }
+            _ => unreachable!(
+                "not all instructions were compiled! \
+                 found uncompiled instruction: {:?}",
+                self
+            ),
+        };
+        *self = maybeinst;
+    }
+
+    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
+        let filled = match *self {
+            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
+            _ => unreachable!(
+                "must be called on Split instruction, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        };
+        *self = MaybeInst::Compiled(filled);
+    }
+
+    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
+        let half_filled = match *self {
+            MaybeInst::Split => goto1,
+            _ => unreachable!(
+                "must be called on Split instruction, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        };
+        *self = MaybeInst::Split1(half_filled);
+    }
+
+    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
+        let half_filled = match *self {
+            MaybeInst::Split => goto2,
+            _ => unreachable!(
+                "must be called on Split instruction, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        };
+        *self = MaybeInst::Split2(half_filled);
+    }
+
+    fn unwrap(self) -> Inst {
+        match self {
+            MaybeInst::Compiled(inst) => inst,
+            _ => unreachable!(
+                "must be called on a compiled instruction, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+enum InstHole {
+    Save { slot: usize },
+    EmptyLook { look: EmptyLook },
+    Char { c: char },
+    Ranges { ranges: Vec<(char, char)> },
+    Bytes { start: u8, end: u8 },
+}
+
+impl InstHole {
+    fn fill(&self, goto: InstPtr) -> Inst {
+        match *self {
+            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
+            InstHole::EmptyLook { look } => {
+                Inst::EmptyLook(InstEmptyLook { goto, look })
+            }
+            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
+            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
+                goto,
+                ranges: ranges.clone().into_boxed_slice(),
+            }),
+            InstHole::Bytes { start, end } => {
+                Inst::Bytes(InstBytes { goto, start, end })
+            }
+        }
+    }
+}
+
+struct CompileClass<'a, 'b> {
+    c: &'a mut Compiler,
+    ranges: &'b [hir::ClassUnicodeRange],
+}
+
+impl<'a, 'b> CompileClass<'a, 'b> {
+    fn compile(mut self) -> Result {
+        let mut holes = vec![];
+        let mut initial_entry = None;
+        let mut last_split = Hole::None;
+        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
+        self.c.suffix_cache.clear();
+
+        for (i, range) in self.ranges.iter().enumerate() {
+            let is_last_range = i + 1 == self.ranges.len();
+            utf8_seqs.reset(range.start(), range.end());
+            let mut it = (&mut utf8_seqs).peekable();
+            loop {
+                let utf8_seq = match it.next() {
+                    None => break,
+                    Some(utf8_seq) => utf8_seq,
+                };
+                if is_last_range && it.peek().is_none() {
+                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
+                    holes.push(hole);
+                    self.c.fill(last_split, entry);
+                    last_split = Hole::None;
+                    if initial_entry.is_none() {
+                        initial_entry = Some(entry);
+                    }
+                } else {
+                    if initial_entry.is_none() {
+                        initial_entry = Some(self.c.insts.len());
+                    }
+                    self.c.fill_to_next(last_split);
+                    last_split = self.c.push_split_hole();
+                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
+                    holes.push(hole);
+                    last_split =
+                        self.c.fill_split(last_split, Some(entry), None);
+                }
+            }
+        }
+        self.c.utf8_seqs = Some(utf8_seqs);
+        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
+    }
+
+    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
+        if self.c.compiled.is_reverse {
+            self.c_utf8_seq_(seq)
+        } else {
+            self.c_utf8_seq_(seq.into_iter().rev())
+        }
+    }
+
+    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
+    where
+        I: IntoIterator<Item = &'r Utf8Range>,
+    {
+        // The initial instruction for each UTF-8 sequence should be the same.
+        let mut from_inst = ::std::usize::MAX;
+        let mut last_hole = Hole::None;
+        for byte_range in seq {
+            let key = SuffixCacheKey {
+                from_inst,
+                start: byte_range.start,
+                end: byte_range.end,
+            };
+            {
+                let pc = self.c.insts.len();
+                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
+                    from_inst = cached_pc;
+                    continue;
+                }
+            }
+            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
+            if from_inst == ::std::usize::MAX {
+                last_hole = self.c.push_hole(InstHole::Bytes {
+                    start: byte_range.start,
+                    end: byte_range.end,
+                });
+            } else {
+                self.c.push_compiled(Inst::Bytes(InstBytes {
+                    goto: from_inst,
+                    start: byte_range.start,
+                    end: byte_range.end,
+                }));
+            }
+            from_inst = self.c.insts.len().checked_sub(1).unwrap();
+            debug_assert!(from_inst < ::std::usize::MAX);
+        }
+        debug_assert!(from_inst < ::std::usize::MAX);
+        Ok(Patch { hole: last_hole, entry: from_inst })
+    }
+}
+
+/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
+/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
+/// The set of byte ranges looks like this:
+///
+/// [0-7F]
+/// [C2-DF][80-BF]
+/// [E0][A0-BF][80-BF]
+/// [E1-EC][80-BF][80-BF]
+/// [ED][80-9F][80-BF]
+/// [EE-EF][80-BF][80-BF]
+///
+/// Each line above translates to one alternate in the compiled regex program.
+/// However, all but one of the alternates end in the same suffix, which is
+/// a waste of an instruction. The suffix cache facilitates reusing them across
+/// alternates.
+///
+/// Note that a HashMap could be trivially used for this, but we don't need its
+/// overhead. Some small bounded space (LRU style) is more than enough.
+///
+/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
+/// except it uses hashes as original indices and then compares full keys for
+/// validation against `dense` array.
+#[derive(Debug)]
+struct SuffixCache {
+    sparse: Box<[usize]>,
+    dense: Vec<SuffixCacheEntry>,
+}
+
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
+struct SuffixCacheEntry {
+    key: SuffixCacheKey,
+    pc: InstPtr,
+}
+
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
+struct SuffixCacheKey {
+    from_inst: InstPtr,
+    start: u8,
+    end: u8,
+}
+
+impl SuffixCache {
+    fn new(size: usize) -> Self {
+        SuffixCache {
+            sparse: vec![0usize; size].into(),
+            dense: Vec::with_capacity(size),
+        }
+    }
+
+    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
+        let hash = self.hash(&key);
+        let pos = &mut self.sparse[hash];
+        if let Some(entry) = self.dense.get(*pos) {
+            if entry.key == key {
+                return Some(entry.pc);
+            }
+        }
+        *pos = self.dense.len();
+        self.dense.push(SuffixCacheEntry { key, pc });
+        None
+    }
+
+    fn clear(&mut self) {
+        self.dense.clear();
+    }
+
+    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
+        // Basic FNV-1a hash as described:
+        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+        const FNV_PRIME: u64 = 1_099_511_628_211;
+        let mut h = 14_695_981_039_346_656_037;
+        h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
+        h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
+        h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
+        (h as usize) % self.sparse.len()
+    }
+}
+
+struct ByteClassSet([bool; 256]);
+
+impl ByteClassSet {
+    fn new() -> Self {
+        ByteClassSet([false; 256])
+    }
+
+    fn set_range(&mut self, start: u8, end: u8) {
+        debug_assert!(start <= end);
+        if start > 0 {
+            self.0[start as usize - 1] = true;
+        }
+        self.0[end as usize] = true;
+    }
+
+    fn set_word_boundary(&mut self) {
+        // We need to mark all ranges of bytes whose pairs result in
+        // evaluating \b differently.
+        let iswb = is_word_byte;
+        let mut b1: u16 = 0;
+        let mut b2: u16;
+        while b1 <= 255 {
+            b2 = b1 + 1;
+            while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
+                b2 += 1;
+            }
+            self.set_range(b1 as u8, (b2 - 1) as u8);
+            b1 = b2;
+        }
+    }
+
+    fn byte_classes(&self) -> Vec<u8> {
+        // N.B. If you're debugging the DFA, it's useful to simply return
+        // `(0..256).collect()`, which effectively removes the byte classes
+        // and makes the transitions easier to read.
+        // (0usize..256).map(|x| x as u8).collect()
+        let mut byte_classes = vec![0; 256];
+        let mut class = 0u8;
+        let mut i = 0;
+        loop {
+            byte_classes[i] = class as u8;
+            if i >= 255 {
+                break;
+            }
+            if self.0[i] {
+                class = class.checked_add(1).unwrap();
+            }
+            i += 1;
+        }
+        byte_classes
+    }
+}
+
+impl fmt::Debug for ByteClassSet {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
+    }
+}
+
+fn u32_to_usize(n: u32) -> usize {
+    // In case usize is less than 32 bits, we need to guard against overflow.
+    // On most platforms this compiles to nothing.
+    // TODO Use `std::convert::TryFrom` once it's stable.
+    if (n as u64) > (::std::usize::MAX as u64) {
+        panic!("BUG: {} is too big to be pointer sized", n)
+    }
+    n as usize
+}
+
+#[cfg(test)]
+mod tests {
+    use super::ByteClassSet;
+
+    #[test]
+    fn byte_classes() {
+        let mut set = ByteClassSet::new();
+        set.set_range(b'a', b'z');
+        let classes = set.byte_classes();
+        assert_eq!(classes[0], 0);
+        assert_eq!(classes[1], 0);
+        assert_eq!(classes[2], 0);
+        assert_eq!(classes[b'a' as usize - 1], 0);
+        assert_eq!(classes[b'a' as usize], 1);
+        assert_eq!(classes[b'm' as usize], 1);
+        assert_eq!(classes[b'z' as usize], 1);
+        assert_eq!(classes[b'z' as usize + 1], 2);
+        assert_eq!(classes[254], 2);
+        assert_eq!(classes[255], 2);
+
+        let mut set = ByteClassSet::new();
+        set.set_range(0, 2);
+        set.set_range(4, 6);
+        let classes = set.byte_classes();
+        assert_eq!(classes[0], 0);
+        assert_eq!(classes[1], 0);
+        assert_eq!(classes[2], 0);
+        assert_eq!(classes[3], 1);
+        assert_eq!(classes[4], 2);
+        assert_eq!(classes[5], 2);
+        assert_eq!(classes[6], 2);
+        assert_eq!(classes[7], 3);
+        assert_eq!(classes[255], 3);
+    }
+
+    #[test]
+    fn full_byte_classes() {
+        let mut set = ByteClassSet::new();
+        for i in 0..256u16 {
+            set.set_range(i as u8, i as u8);
+        }
+        assert_eq!(set.byte_classes().len(), 256);
+    }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
commit	36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree	105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/regex/src/compile.rs
parent	Initial commit. (diff)
download	firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip