Adding upstream version 110.0.1.upstream/110.0.1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:22:09 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:22:09 +0000
commit: 43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree: 620249daf56c0258faa40cbdcf9cfba06de2a846 /js/src/irregexp/imported/regexp-parser.cc
parent: Initial commit. (diff)
download: firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz
firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip
1 files changed, 3136 insertions, 0 deletions
diff --git a/js/src/irregexp/imported/regexp-parser.cc b/js/src/irregexp/imported/regexp-parser.cc
new file mode 100644
index 0000000000..2e44dcd838
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-parser.cc
@@ -0,0 +1,3136 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-parser.h"
+
+#include "irregexp/imported/regexp-ast.h"
+#include "irregexp/imported/regexp-macro-assembler.h"
+#include "irregexp/imported/regexp.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/usetiter.h"
+#endif  // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+namespace {
+
+// Whether we're currently inside the ClassEscape production
+// (tc39.es/ecma262/#prod-annexB-CharacterEscape).
+enum class InClassEscapeState {
+  kInClass,
+  kNotInClass,
+};
+
+// The production used to derive ClassSetOperand.
+enum class ClassSetOperandType {
+  kClassSetCharacter,
+  kClassStringDisjunction,
+  kNestedClass,
+  kCharacterClassEscape,  // \ CharacterClassEscape is a special nested class,
+                          // as we can fold it directly into another range.
+  kClassSetRange
+};
+
+class RegExpTextBuilder {
+ public:
+  using SmallRegExpTreeVector =
+      base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
+
+  RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage,
+                    RegExpFlags flags)
+      : zone_(zone),
+        flags_(flags),
+        terms_(terms_storage),
+        text_(ZoneAllocator<RegExpTree*>{zone}) {}
+  void AddCharacter(base::uc16 character);
+  void AddUnicodeCharacter(base::uc32 character);
+  void AddEscapedUnicodeCharacter(base::uc32 character);
+  void AddAtom(RegExpTree* atom);
+  void AddTerm(RegExpTree* term);
+  void AddClassRanges(RegExpClassRanges* cc);
+  void FlushPendingSurrogate();
+  void FlushText();
+  RegExpTree* PopLastAtom();
+  RegExpTree* ToRegExp();
+
+ private:
+  static const base::uc16 kNoPendingSurrogate = 0;
+
+  void AddLeadSurrogate(base::uc16 lead_surrogate);
+  void AddTrailSurrogate(base::uc16 trail_surrogate);
+  void FlushCharacters();
+  bool NeedsDesugaringForUnicode(RegExpClassRanges* cc);
+  bool NeedsDesugaringForIgnoreCase(base::uc32 c);
+  void AddClassRangesForDesugaring(base::uc32 c);
+  bool ignore_case() const { return IsIgnoreCase(flags_); }
+  bool IsUnicodeMode() const {
+    // Either /v or /u enable UnicodeMode
+    // TODO(v8:11935): Change permalink once proposal is in stage 4.
+    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+    return IsUnicode(flags_) || IsUnicodeSets(flags_);
+  }
+  Zone* zone() const { return zone_; }
+
+  Zone* const zone_;
+  const RegExpFlags flags_;
+  ZoneList<base::uc16>* characters_ = nullptr;
+  base::uc16 pending_surrogate_ = kNoPendingSurrogate;
+  SmallRegExpTreeVector* terms_;
+  SmallRegExpTreeVector text_;
+};
+
+void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
+  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+  FlushPendingSurrogate();
+  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+  pending_surrogate_ = lead_surrogate;
+}
+
+void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
+  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+  if (pending_surrogate_ != kNoPendingSurrogate) {
+    base::uc16 lead_surrogate = pending_surrogate_;
+    pending_surrogate_ = kNoPendingSurrogate;
+    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+    base::uc32 combined =
+        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+    if (NeedsDesugaringForIgnoreCase(combined)) {
+      AddClassRangesForDesugaring(combined);
+    } else {
+      ZoneList<base::uc16> surrogate_pair(2, zone());
+      surrogate_pair.Add(lead_surrogate, zone());
+      surrogate_pair.Add(trail_surrogate, zone());
+      RegExpAtom* atom =
+          zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
+      AddAtom(atom);
+    }
+  } else {
+    pending_surrogate_ = trail_surrogate;
+    FlushPendingSurrogate();
+  }
+}
+
+void RegExpTextBuilder::FlushPendingSurrogate() {
+  if (pending_surrogate_ != kNoPendingSurrogate) {
+    DCHECK(IsUnicodeMode());
+    base::uc32 c = pending_surrogate_;
+    pending_surrogate_ = kNoPendingSurrogate;
+    AddClassRangesForDesugaring(c);
+  }
+}
+
+void RegExpTextBuilder::FlushCharacters() {
+  FlushPendingSurrogate();
+  if (characters_ != nullptr) {
+    RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
+    characters_ = nullptr;
+    text_.emplace_back(atom);
+  }
+}
+
+void RegExpTextBuilder::FlushText() {
+  FlushCharacters();
+  size_t num_text = text_.size();
+  if (num_text == 0) {
+    return;
+  } else if (num_text == 1) {
+    terms_->emplace_back(text_.back());
+  } else {
+    RegExpText* text = zone()->New<RegExpText>(zone());
+    for (size_t i = 0; i < num_text; i++) {
+      text_[i]->AppendToText(text, zone());
+    }
+    terms_->emplace_back(text);
+  }
+  text_.clear();
+}
+
+void RegExpTextBuilder::AddCharacter(base::uc16 c) {
+  FlushPendingSurrogate();
+  if (NeedsDesugaringForIgnoreCase(c)) {
+    AddClassRangesForDesugaring(c);
+  } else {
+    if (characters_ == nullptr) {
+      characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
+    }
+    characters_->Add(c, zone());
+  }
+}
+
+void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) {
+  if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
+    DCHECK(IsUnicodeMode());
+    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+  } else if (IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+    AddLeadSurrogate(c);
+  } else if (IsUnicodeMode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+    AddTrailSurrogate(c);
+  } else {
+    AddCharacter(static_cast<base::uc16>(c));
+  }
+}
+
+void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
+  // A lead or trail surrogate parsed via escape sequence will not
+  // pair up with any preceding lead or following trail surrogate.
+  FlushPendingSurrogate();
+  AddUnicodeCharacter(character);
+  FlushPendingSurrogate();
+}
+
+void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) {
+  if (NeedsDesugaringForUnicode(cr)) {
+    // With /u or /v, character class needs to be desugared, so it
+    // must be a standalone term instead of being part of a RegExpText.
+    AddTerm(cr);
+  } else {
+    AddAtom(cr);
+  }
+}
+
+void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) {
+  AddTerm(zone()->New<RegExpClassRanges>(
+      zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
+}
+
+void RegExpTextBuilder::AddAtom(RegExpTree* atom) {
+  DCHECK(atom->IsTextElement());
+  FlushCharacters();
+  text_.emplace_back(atom);
+}
+
+void RegExpTextBuilder::AddTerm(RegExpTree* term) {
+  DCHECK(term->IsTextElement());
+  FlushText();
+  terms_->emplace_back(term);
+}
+
+bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) {
+  if (!IsUnicodeMode()) return false;
+  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
+  // necessarily mean that we need to desugar. It's probably nicer to have a
+  // separate pass to figure out unicode desugarings.
+  if (ignore_case()) return true;
+  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+  CharacterRange::Canonicalize(ranges);
+
+  if (cc->is_negated()) {
+    ZoneList<CharacterRange>* negated_ranges =
+        zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone());
+    CharacterRange::Negate(ranges, negated_ranges, zone());
+    ranges = negated_ranges;
+  }
+
+  for (int i = ranges->length() - 1; i >= 0; i--) {
+    base::uc32 from = ranges->at(i).from();
+    base::uc32 to = ranges->at(i).to();
+    // Check for non-BMP characters.
+    if (to >= kNonBmpStart) return true;
+    // Check for lone surrogates.
+    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+  }
+  return false;
+}
+
+bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
+#ifdef V8_INTL_SUPPORT
+  if (IsUnicodeMode() && ignore_case()) {
+    icu::UnicodeSet set(c, c);
+    set.closeOver(USET_CASE_INSENSITIVE);
+    set.removeAllStrings();
+    return set.size() > 1;
+  }
+  // In the case where ICU is not included, we act as if the unicode flag is
+  // not set, and do not desugar.
+#endif  // V8_INTL_SUPPORT
+  return false;
+}
+
+RegExpTree* RegExpTextBuilder::PopLastAtom() {
+  FlushPendingSurrogate();
+  RegExpTree* atom;
+  if (characters_ != nullptr) {
+    base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
+    int num_chars = char_vector.length();
+    if (num_chars > 1) {
+      base::Vector<const base::uc16> prefix =
+          char_vector.SubVector(0, num_chars - 1);
+      text_.emplace_back(zone()->New<RegExpAtom>(prefix));
+      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
+    }
+    characters_ = nullptr;
+    atom = zone()->New<RegExpAtom>(char_vector);
+    return atom;
+  } else if (text_.size() > 0) {
+    atom = text_.back();
+    text_.pop_back();
+    return atom;
+  }
+  return nullptr;
+}
+
+RegExpTree* RegExpTextBuilder::ToRegExp() {
+  FlushText();
+  size_t num_alternatives = terms_->size();
+  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
+  if (num_alternatives == 1) return terms_->back();
+  return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
+      base::VectorOf(terms_->begin(), terms_->size()), zone()));
+}
+
+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
+class RegExpBuilder {
+ public:
+  RegExpBuilder(Zone* zone, RegExpFlags flags)
+      : zone_(zone),
+        flags_(flags),
+        terms_(ZoneAllocator<RegExpTree*>{zone}),
+        alternatives_(ZoneAllocator<RegExpTree*>{zone}),
+        text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {}
+  void AddCharacter(base::uc16 character);
+  void AddUnicodeCharacter(base::uc32 character);
+  void AddEscapedUnicodeCharacter(base::uc32 character);
+  // "Adds" an empty expression. Does nothing except consume a
+  // following quantifier
+  void AddEmpty();
+  void AddClassRanges(RegExpClassRanges* cc);
+  void AddAtom(RegExpTree* tree);
+  void AddTerm(RegExpTree* tree);
+  void AddAssertion(RegExpTree* tree);
+  void NewAlternative();  // '|'
+  bool AddQuantifierToAtom(int min, int max,
+                           RegExpQuantifier::QuantifierType type);
+  void FlushText();
+  RegExpTree* ToRegExp();
+  RegExpFlags flags() const { return flags_; }
+
+  bool ignore_case() const { return IsIgnoreCase(flags_); }
+  bool multiline() const { return IsMultiline(flags_); }
+  bool dotall() const { return IsDotAll(flags_); }
+
+ private:
+  void FlushTerms();
+  bool IsUnicodeMode() const {
+    // Either /v or /u enable UnicodeMode
+    // TODO(v8:11935): Change permalink once proposal is in stage 4.
+    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+    return IsUnicode(flags_) || IsUnicodeSets(flags_);
+  }
+  Zone* zone() const { return zone_; }
+  RegExpTextBuilder& text_builder() { return text_builder_; }
+
+  Zone* const zone_;
+  bool pending_empty_ = false;
+  const RegExpFlags flags_;
+
+  using SmallRegExpTreeVector =
+      base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
+  SmallRegExpTreeVector terms_;
+  SmallRegExpTreeVector alternatives_;
+  RegExpTextBuilder text_builder_;
+};
+
+enum SubexpressionType {
+  INITIAL,
+  CAPTURE,  // All positive values represent captures.
+  POSITIVE_LOOKAROUND,
+  NEGATIVE_LOOKAROUND,
+  GROUPING
+};
+
+class RegExpParserState : public ZoneObject {
+ public:
+  // Push a state on the stack.
+  RegExpParserState(RegExpParserState* previous_state,
+                    SubexpressionType group_type,
+                    RegExpLookaround::Type lookaround_type,
+                    int disjunction_capture_index,
+                    const ZoneVector<base::uc16>* capture_name,
+                    RegExpFlags flags, Zone* zone)
+      : previous_state_(previous_state),
+        builder_(zone, flags),
+        group_type_(group_type),
+        lookaround_type_(lookaround_type),
+        disjunction_capture_index_(disjunction_capture_index),
+        capture_name_(capture_name) {}
+  // Parser state of containing expression, if any.
+  RegExpParserState* previous_state() const { return previous_state_; }
+  bool IsSubexpression() { return previous_state_ != nullptr; }
+  // RegExpBuilder building this regexp's AST.
+  RegExpBuilder* builder() { return &builder_; }
+  // Type of regexp being parsed (parenthesized group or entire regexp).
+  SubexpressionType group_type() const { return group_type_; }
+  // Lookahead or Lookbehind.
+  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
+  // Index in captures array of first capture in this sub-expression, if any.
+  // Also the capture index of this sub-expression itself, if group_type
+  // is CAPTURE.
+  int capture_index() const { return disjunction_capture_index_; }
+  // The name of the current sub-expression, if group_type is CAPTURE. Only
+  // used for named captures.
+  const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
+
+  bool IsNamedCapture() const { return capture_name_ != nullptr; }
+
+  // Check whether the parser is inside a capture group with the given index.
+  bool IsInsideCaptureGroup(int index) const {
+    for (const RegExpParserState* s = this; s != nullptr;
+         s = s->previous_state()) {
+      if (s->group_type() != CAPTURE) continue;
+      // Return true if we found the matching capture index.
+      if (index == s->capture_index()) return true;
+      // Abort if index is larger than what has been parsed up till this state.
+      if (index > s->capture_index()) return false;
+    }
+    return false;
+  }
+
+  // Check whether the parser is inside a capture group with the given name.
+  bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
+    DCHECK_NOT_NULL(name);
+    for (const RegExpParserState* s = this; s != nullptr;
+         s = s->previous_state()) {
+      if (s->capture_name() == nullptr) continue;
+      if (*s->capture_name() == *name) return true;
+    }
+    return false;
+  }
+
+ private:
+  // Linked list implementation of stack of states.
+  RegExpParserState* const previous_state_;
+  // Builder for the stored disjunction.
+  RegExpBuilder builder_;
+  // Stored disjunction type (capture, look-ahead or grouping), if any.
+  const SubexpressionType group_type_;
+  // Stored read direction.
+  const RegExpLookaround::Type lookaround_type_;
+  // Stored disjunction's capture index (if any).
+  const int disjunction_capture_index_;
+  // Stored capture name (if any).
+  const ZoneVector<base::uc16>* const capture_name_;
+};
+
+template <class CharT>
+class RegExpParserImpl final {
+ private:
+  RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
+                   uintptr_t stack_limit, Zone* zone,
+                   const DisallowGarbageCollection& no_gc);
+
+  bool Parse(RegExpCompileData* result);
+
+  RegExpTree* ParsePattern();
+  RegExpTree* ParseDisjunction();
+  RegExpTree* ParseGroup();
+
+  // Parses a {...,...} quantifier and stores the range in the given
+  // out parameters.
+  bool ParseIntervalQuantifier(int* min_out, int* max_out);
+
+  // Checks whether the following is a length-digit hexadecimal number,
+  // and sets the value if it is.
+  bool ParseHexEscape(int length, base::uc32* value);
+  bool ParseUnicodeEscape(base::uc32* value);
+  bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
+
+  bool ParsePropertyClassName(ZoneVector<char>* name_1,
+                              ZoneVector<char>* name_2);
+  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range,
+                             CharacterClassStrings* add_to_strings, bool negate,
+                             const ZoneVector<char>& name_1,
+                             const ZoneVector<char>& name_2);
+
+  RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges,
+                               bool add_unicode_case_equivalents);
+  // Parse inside a class. Either add escaped class to the range, or return
+  // false and pass parsed single character through |char_out|.
+  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
+                        bool add_unicode_case_equivalents, base::uc32* char_out,
+                        bool* is_class_escape);
+  // Returns true iff parsing was successful.
+  bool TryParseCharacterClassEscape(base::uc32 next,
+                                    InClassEscapeState in_class_escape_state,
+                                    ZoneList<CharacterRange>* ranges,
+                                    CharacterClassStrings* strings, Zone* zone,
+                                    bool add_unicode_case_equivalents);
+  RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,
+                                          CharacterClassStrings* strings);
+  RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
+                                   ClassSetOperandType* type_out);
+  RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
+                                   ClassSetOperandType* type_out,
+                                   ZoneList<CharacterRange>* ranges,
+                                   CharacterClassStrings* strings);
+  base::uc32 ParseClassSetCharacter();
+  // Parses and returns a single escaped character.
+  base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
+                                  bool* is_escaped_unicode_character);
+
+  RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,
+                              RegExpTree* first_operand,
+                              ClassSetOperandType first_operand_type,
+                              ZoneList<CharacterRange>* ranges,
+                              CharacterClassStrings* strings);
+  RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,
+                                     bool is_negated, RegExpTree* first_operand,
+                                     ClassSetOperandType first_operand_type);
+  RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder,
+                                    bool is_negated, RegExpTree* first_operand,
+                                    ClassSetOperandType first_operand_type);
+  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
+
+  base::uc32 ParseOctalLiteral();
+
+  // Tries to parse the input as a back reference.  If successful it
+  // stores the result in the output parameter and returns true.  If
+  // it fails it will push back the characters read so the same characters
+  // can be reparsed.
+  bool ParseBackReferenceIndex(int* index_out);
+
+  RegExpTree* ReportError(RegExpError error);
+  void Advance();
+  void Advance(int dist);
+  void RewindByOneCodepoint();  // Rewinds to before the previous Advance().
+  void Reset(int pos);
+
+  // Reports whether the pattern might be used as a literal search string.
+  // Only use if the result of the parse is a single atom node.
+  bool simple() const { return simple_; }
+  bool contains_anchor() const { return contains_anchor_; }
+  void set_contains_anchor() { contains_anchor_ = true; }
+  int captures_started() const { return captures_started_; }
+  int position() const { return next_pos_ - 1; }
+  bool failed() const { return failed_; }
+  RegExpFlags flags() const { return top_level_flags_; }
+  bool IsUnicodeMode() const {
+    // Either /v or /u enable UnicodeMode
+    // TODO(v8:11935): Change permalink once proposal is in stage 4.
+    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+    return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;
+  }
+  bool unicode_sets() const { return IsUnicodeSets(flags()); }
+  bool ignore_case() const { return IsIgnoreCase(flags()); }
+
+  static bool IsSyntaxCharacterOrSlash(base::uc32 c);
+  static bool IsClassSetSyntaxCharacter(base::uc32 c);
+  static bool IsClassSetReservedPunctuator(base::uc32 c);
+  bool IsClassSetReservedDoublePunctuator(base::uc32 c);
+
+  static const base::uc32 kEndMarker = (1 << 21);
+
+ private:
+  // Return the 1-indexed RegExpCapture object, allocate if necessary.
+  RegExpCapture* GetCapture(int index);
+
+  // Creates a new named capture at the specified index. Must be called exactly
+  // once for each named capture. Fails if a capture with the same name is
+  // encountered.
+  bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
+
+  // Parses the name of a capture group (?<name>pattern). The name must adhere
+  // to IdentifierName in the ECMAScript standard.
+  const ZoneVector<base::uc16>* ParseCaptureGroupName();
+
+  bool ParseNamedBackReference(RegExpBuilder* builder,
+                               RegExpParserState* state);
+  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
+
+  // After the initial parsing pass, patch corresponding RegExpCapture objects
+  // into all RegExpBackReferences. This is done after initial parsing in order
+  // to avoid complicating cases in which references comes before the capture.
+  void PatchNamedBackReferences();
+
+  ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
+
+  // Returns true iff the pattern contains named captures. May call
+  // ScanForCaptures to look ahead at the remaining pattern.
+  bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
+
+  Zone* zone() const { return zone_; }
+
+  base::uc32 current() const { return current_; }
+  bool has_more() const { return has_more_; }
+  bool has_next() const { return next_pos_ < input_length(); }
+  base::uc32 Next();
+  template <bool update_position>
+  base::uc32 ReadNext();
+  CharT InputAt(int index) const {
+    DCHECK(0 <= index && index < input_length());
+    return input_[index];
+  }
+  int input_length() const { return input_length_; }
+  void ScanForCaptures(InClassEscapeState in_class_escape_state);
+
+  struct RegExpCaptureNameLess {
+    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
+      DCHECK_NOT_NULL(lhs);
+      DCHECK_NOT_NULL(rhs);
+      return *lhs->name() < *rhs->name();
+    }
+  };
+
+  class ForceUnicodeScope final {
+   public:
+    explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
+        : parser_(parser) {
+      DCHECK(!parser_->force_unicode_);
+      parser_->force_unicode_ = true;
+    }
+    ~ForceUnicodeScope() {
+      DCHECK(parser_->force_unicode_);
+      parser_->force_unicode_ = false;
+    }
+
+   private:
+    RegExpParserImpl<CharT>* const parser_;
+  };
+
+  const DisallowGarbageCollection no_gc_;
+  Zone* const zone_;
+  RegExpError error_ = RegExpError::kNone;
+  int error_pos_ = 0;
+  ZoneList<RegExpCapture*>* captures_;
+  ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
+  ZoneList<RegExpBackReference*>* named_back_references_;
+  const CharT* const input_;
+  const int input_length_;
+  base::uc32 current_;
+  const RegExpFlags top_level_flags_;
+  bool force_unicode_ = false;  // Force parser to act as if unicode were set.
+  int next_pos_;
+  int captures_started_;
+  int capture_count_;  // Only valid after we have scanned for captures.
+  bool has_more_;
+  bool simple_;
+  bool contains_anchor_;
+  bool is_scanned_for_captures_;
+  bool has_named_captures_;  // Only valid after we have scanned for captures.
+  bool failed_;
+  const uintptr_t stack_limit_;
+
+  friend class v8::internal::RegExpParser;
+};
+
+template <class CharT>
+RegExpParserImpl<CharT>::RegExpParserImpl(
+    const CharT* input, int input_length, RegExpFlags flags,
+    uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
+    : zone_(zone),
+      captures_(nullptr),
+      named_captures_(nullptr),
+      named_back_references_(nullptr),
+      input_(input),
+      input_length_(input_length),
+      current_(kEndMarker),
+      top_level_flags_(flags),
+      next_pos_(0),
+      captures_started_(0),
+      capture_count_(0),
+      has_more_(true),
+      simple_(false),
+      contains_anchor_(false),
+      is_scanned_for_captures_(false),
+      has_named_captures_(false),
+      failed_(false),
+      stack_limit_(stack_limit) {
+  Advance();
+}
+
+template <>
+template <bool update_position>
+inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
+  int position = next_pos_;
+  base::uc16 c0 = InputAt(position);
+  position++;
+  DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
+  if (update_position) next_pos_ = position;
+  return c0;
+}
+
+template <>
+template <bool update_position>
+inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
+  int position = next_pos_;
+  base::uc16 c0 = InputAt(position);
+  base::uc32 result = c0;
+  position++;
+  // Read the whole surrogate pair in case of unicode mode, if possible.
+  if (IsUnicodeMode() && position < input_length() &&
+      unibrow::Utf16::IsLeadSurrogate(c0)) {
+    base::uc16 c1 = InputAt(position);
+    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+      result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
+      position++;
+    }
+  }
+  if (update_position) next_pos_ = position;
+  return result;
+}
+
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::Next() {
+  if (has_next()) {
+    return ReadNext<false>();
+  } else {
+    return kEndMarker;
+  }
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::Advance() {
+  if (has_next()) {
+    if (GetCurrentStackPosition() < stack_limit_) {
+      if (v8_flags.correctness_fuzzer_suppressions) {
+        FATAL("Aborting on stack overflow");
+      }
+      ReportError(RegExpError::kStackOverflow);
+    } else {
+      current_ = ReadNext<true>();
+    }
+  } else {
+    current_ = kEndMarker;
+    // Advance so that position() points to 1-after-the-last-character. This is
+    // important so that Reset() to this position works correctly.
+    next_pos_ = input_length() + 1;
+    has_more_ = false;
+  }
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
+  if (!has_more()) return;
+  // Rewinds by one code point, i.e.: two code units if `current` is outside
+  // the basic multilingual plane (= composed of a lead and trail surrogate),
+  // or one code unit otherwise.
+  const int rewind_by =
+      current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;
+  Advance(rewind_by);  // Undo the last Advance.
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::Reset(int pos) {
+  next_pos_ = pos;
+  has_more_ = (pos < input_length());
+  Advance();
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::Advance(int dist) {
+  next_pos_ += dist - 1;
+  Advance();
+}
+
+// static
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
+  switch (c) {
+    case '^':
+    case '$':
+    case '\\':
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '|':
+    case '/':
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+// static
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) {
+  switch (c) {
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '/':
+    case '-':
+    case '\\':
+    case '|':
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+// static
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) {
+  switch (c) {
+    case '&':
+    case '-':
+    case '!':
+    case '#':
+    case '%':
+    case ',':
+    case ':':
+    case ';':
+    case '<':
+    case '=':
+    case '>':
+    case '@':
+    case '`':
+    case '~':
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) {
+#define DOUBLE_PUNCTUATOR_CASE(Char) \
+  case Char:                         \
+    return Next() == Char
+
+  switch (c) {
+    DOUBLE_PUNCTUATOR_CASE('&');
+    DOUBLE_PUNCTUATOR_CASE('!');
+    DOUBLE_PUNCTUATOR_CASE('#');
+    DOUBLE_PUNCTUATOR_CASE('$');
+    DOUBLE_PUNCTUATOR_CASE('%');
+    DOUBLE_PUNCTUATOR_CASE('*');
+    DOUBLE_PUNCTUATOR_CASE('+');
+    DOUBLE_PUNCTUATOR_CASE(',');
+    DOUBLE_PUNCTUATOR_CASE('.');
+    DOUBLE_PUNCTUATOR_CASE(':');
+    DOUBLE_PUNCTUATOR_CASE(';');
+    DOUBLE_PUNCTUATOR_CASE('<');
+    DOUBLE_PUNCTUATOR_CASE('=');
+    DOUBLE_PUNCTUATOR_CASE('>');
+    DOUBLE_PUNCTUATOR_CASE('?');
+    DOUBLE_PUNCTUATOR_CASE('@');
+    DOUBLE_PUNCTUATOR_CASE('^');
+    DOUBLE_PUNCTUATOR_CASE('`');
+    DOUBLE_PUNCTUATOR_CASE('~');
+    default:
+      break;
+  }
+#undef DOUBLE_PUNCTUATOR_CASE
+
+  return false;
+}
+
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
+  if (failed_) return nullptr;  // Do not overwrite any existing error.
+  failed_ = true;
+  error_ = error;
+  error_pos_ = position();
+  // Zip to the end to make sure no more input is read.
+  current_ = kEndMarker;
+  next_pos_ = input_length();
+  has_more_ = false;
+  return nullptr;
+}
+
+#define CHECK_FAILED /**/);    \
+  if (failed_) return nullptr; \
+  ((void)0
+
+// Pattern ::
+//   Disjunction
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
+  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
+  PatchNamedBackReferences(CHECK_FAILED);
+  DCHECK(!has_more());
+  // If the result of parsing is a literal string atom, and it has the
+  // same length as the input, then the atom is identical to the input.
+  if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
+    simple_ = true;
+  }
+  return result;
+}
+
+// Disjunction ::
+//   Alternative
+//   Alternative | Disjunction
+// Alternative ::
+//   [empty]
+//   Term Alternative
+// Term ::
+//   Assertion
+//   Atom
+//   Atom Quantifier
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
+  // Used to store current state while parsing subexpressions.
+  RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
+                                  0, nullptr, flags(), zone());
+  RegExpParserState* state = &initial_state;
+  // Cache the builder in a local variable for quick access.
+  RegExpBuilder* builder = initial_state.builder();
+  while (true) {
+    switch (current()) {
+      case kEndMarker:
+        if (failed()) return nullptr;  // E.g. the initial Advance failed.
+        if (state->IsSubexpression()) {
+          // Inside a parenthesized group when hitting end of input.
+          return ReportError(RegExpError::kUnterminatedGroup);
+        }
+        DCHECK_EQ(INITIAL, state->group_type());
+        // Parsing completed successfully.
+        return builder->ToRegExp();
+      case ')': {
+        if (!state->IsSubexpression()) {
+          return ReportError(RegExpError::kUnmatchedParen);
+        }
+        DCHECK_NE(INITIAL, state->group_type());
+
+        Advance();
+        // End disjunction parsing and convert builder content to new single
+        // regexp atom.
+        RegExpTree* body = builder->ToRegExp();
+
+        int end_capture_index = captures_started();
+
+        int capture_index = state->capture_index();
+        SubexpressionType group_type = state->group_type();
+
+        // Build result of subexpression.
+        if (group_type == CAPTURE) {
+          if (state->IsNamedCapture()) {
+            CreateNamedCaptureAtIndex(state->capture_name(),
+                                      capture_index CHECK_FAILED);
+          }
+          RegExpCapture* capture = GetCapture(capture_index);
+          capture->set_body(body);
+          body = capture;
+        } else if (group_type == GROUPING) {
+          body = zone()->template New<RegExpGroup>(body);
+        } else {
+          DCHECK(group_type == POSITIVE_LOOKAROUND ||
+                 group_type == NEGATIVE_LOOKAROUND);
+          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
+          body = zone()->template New<RegExpLookaround>(
+              body, is_positive, end_capture_index - capture_index,
+              capture_index, state->lookaround_type());
+        }
+
+        // Restore previous state.
+        state = state->previous_state();
+        builder = state->builder();
+
+        builder->AddAtom(body);
+        // For compatibility with JSC and ES3, we allow quantifiers after
+        // lookaheads, and break in all cases.
+        break;
+      }
+      case '|': {
+        Advance();
+        builder->NewAlternative();
+        continue;
+      }
+      case '*':
+      case '+':
+      case '?':
+        return ReportError(RegExpError::kNothingToRepeat);
+      case '^': {
+        Advance();
+        builder->AddAssertion(zone()->template New<RegExpAssertion>(
+            builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
+                                 : RegExpAssertion::Type::START_OF_INPUT));
+        set_contains_anchor();
+        continue;
+      }
+      case '$': {
+        Advance();
+        RegExpAssertion::Type assertion_type =
+            builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
+                                 : RegExpAssertion::Type::END_OF_INPUT;
+        builder->AddAssertion(
+            zone()->template New<RegExpAssertion>(assertion_type));
+        continue;
+      }
+      case '.': {
+        Advance();
+        ZoneList<CharacterRange>* ranges =
+            zone()->template New<ZoneList<CharacterRange>>(2, zone());
+
+        if (builder->dotall()) {
+          // Everything.
+          CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
+                                         ranges, false, zone());
+        } else {
+          // Everything except \x0A, \x0D, \u2028 and \u2029.
+          CharacterRange::AddClassEscape(
+              StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
+        }
+
+        RegExpClassRanges* cc =
+            zone()->template New<RegExpClassRanges>(zone(), ranges);
+        builder->AddClassRanges(cc);
+        break;
+      }
+      case '(': {
+        state = ParseOpenParenthesis(state CHECK_FAILED);
+        builder = state->builder();
+        continue;
+      }
+      case '[': {
+        RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
+        if (cc->IsClassRanges()) {
+          builder->AddClassRanges(cc->AsClassRanges());
+        } else {
+          DCHECK(cc->IsClassSetExpression());
+          builder->AddTerm(cc);
+        }
+        break;
+      }
+      // Atom ::
+      //   \ AtomEscape
+      case '\\':
+        switch (Next()) {
+          case kEndMarker:
+            return ReportError(RegExpError::kEscapeAtEndOfPattern);
+          // AtomEscape ::
+          //   [+UnicodeMode] DecimalEscape
+          //   [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
+          //                  of DecimalEscape is ≤ NcapturingParens
+          //   CharacterEscape (some cases of this mixed in too)
+          //
+          // TODO(jgruber): It may make sense to disentangle all the different
+          // cases and make the structure mirror the spec, e.g. for AtomEscape:
+          //
+          //  if (TryParseDecimalEscape(...)) return;
+          //  if (TryParseCharacterClassEscape(...)) return;
+          //  if (TryParseCharacterEscape(...)) return;
+          //  if (TryParseGroupName(...)) return;
+          case '1':
+          case '2':
+          case '3':
+          case '4':
+          case '5':
+          case '6':
+          case '7':
+          case '8':
+          case '9': {
+            int index = 0;
+            const bool is_backref =
+                ParseBackReferenceIndex(&index CHECK_FAILED);
+            if (is_backref) {
+              if (state->IsInsideCaptureGroup(index)) {
+                // The back reference is inside the capture group it refers to.
+                // Nothing can possibly have been captured yet, so we use empty
+                // instead. This ensures that, when checking a back reference,
+                // the capture registers of the referenced capture are either
+                // both set or both cleared.
+                builder->AddEmpty();
+              } else {
+                RegExpCapture* capture = GetCapture(index);
+                RegExpTree* atom = zone()->template New<RegExpBackReference>(
+                    capture, builder->flags());
+                builder->AddAtom(atom);
+              }
+              break;
+            }
+            // With /u and /v, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (IsUnicodeMode()) {
+              return ReportError(RegExpError::kInvalidEscape);
+            }
+            base::uc32 first_digit = Next();
+            if (first_digit == '8' || first_digit == '9') {
+              builder->AddCharacter(first_digit);
+              Advance(2);
+              break;
+            }
+            V8_FALLTHROUGH;
+          }
+          case '0': {
+            Advance();
+            if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') {
+              // Decimal escape with leading 0 are not parsed as octal.
+              return ReportError(RegExpError::kInvalidDecimalEscape);
+            }
+            base::uc32 octal = ParseOctalLiteral();
+            builder->AddCharacter(octal);
+            break;
+          }
+          case 'b':
+            Advance(2);
+            builder->AddAssertion(zone()->template New<RegExpAssertion>(
+                RegExpAssertion::Type::BOUNDARY));
+            continue;
+          case 'B':
+            Advance(2);
+            builder->AddAssertion(zone()->template New<RegExpAssertion>(
+                RegExpAssertion::Type::NON_BOUNDARY));
+            continue;
+          // AtomEscape ::
+          //   CharacterClassEscape
+          case 'd':
+          case 'D':
+          case 's':
+          case 'S':
+          case 'w':
+          case 'W': {
+            base::uc32 next = Next();
+            ZoneList<CharacterRange>* ranges =
+                zone()->template New<ZoneList<CharacterRange>>(2, zone());
+            bool add_unicode_case_equivalents =
+                IsUnicodeMode() && ignore_case();
+            bool parsed_character_class_escape = TryParseCharacterClassEscape(
+                next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(),
+                add_unicode_case_equivalents CHECK_FAILED);
+
+            if (parsed_character_class_escape) {
+              RegExpClassRanges* cc =
+                  zone()->template New<RegExpClassRanges>(zone(), ranges);
+              builder->AddClassRanges(cc);
+            } else {
+              CHECK(!IsUnicodeMode());
+              Advance(2);
+              builder->AddCharacter(next);  // IdentityEscape.
+            }
+            break;
+          }
+          case 'p':
+          case 'P': {
+            base::uc32 next = Next();
+            ZoneList<CharacterRange>* ranges =
+                zone()->template New<ZoneList<CharacterRange>>(2, zone());
+            CharacterClassStrings* strings = nullptr;
+            if (unicode_sets()) {
+              strings = zone()->template New<CharacterClassStrings>(zone());
+            }
+            bool add_unicode_case_equivalents = ignore_case();
+            bool parsed_character_class_escape = TryParseCharacterClassEscape(
+                next, InClassEscapeState::kNotInClass, ranges, strings, zone(),
+                add_unicode_case_equivalents CHECK_FAILED);
+
+            if (parsed_character_class_escape) {
+              if (unicode_sets()) {
+                RegExpClassSetOperand* op =
+                    zone()->template New<RegExpClassSetOperand>(ranges,
+                                                                strings);
+                builder->AddTerm(op);
+              } else {
+                RegExpClassRanges* cc =
+                    zone()->template New<RegExpClassRanges>(zone(), ranges);
+                builder->AddClassRanges(cc);
+              }
+            } else {
+              CHECK(!IsUnicodeMode());
+              Advance(2);
+              builder->AddCharacter(next);  // IdentityEscape.
+            }
+            break;
+          }
+          // AtomEscape ::
+          //   k GroupName
+          case 'k': {
+            // Either an identity escape or a named back-reference.  The two
+            // interpretations are mutually exclusive: '\k' is interpreted as
+            // an identity escape for non-Unicode patterns without named
+            // capture groups, and as the beginning of a named back-reference
+            // in all other cases.
+            const bool has_named_captures =
+                HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
+            if (IsUnicodeMode() || has_named_captures) {
+              Advance(2);
+              ParseNamedBackReference(builder, state CHECK_FAILED);
+              break;
+            }
+          }
+            V8_FALLTHROUGH;
+          // AtomEscape ::
+          //   CharacterEscape
+          default: {
+            bool is_escaped_unicode_character = false;
+            base::uc32 c = ParseCharacterEscape(
+                InClassEscapeState::kNotInClass,
+                &is_escaped_unicode_character CHECK_FAILED);
+            if (is_escaped_unicode_character) {
+              builder->AddEscapedUnicodeCharacter(c);
+            } else {
+              builder->AddCharacter(c);
+            }
+            break;
+          }
+        }
+        break;
+      case '{': {
+        int dummy;
+        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
+        if (parsed) return ReportError(RegExpError::kNothingToRepeat);
+        V8_FALLTHROUGH;
+      }
+      case '}':
+      case ']':
+        if (IsUnicodeMode()) {
+          return ReportError(RegExpError::kLoneQuantifierBrackets);
+        }
+        V8_FALLTHROUGH;
+      default:
+        builder->AddUnicodeCharacter(current());
+        Advance();
+        break;
+    }  // end switch(current())
+
+    int min;
+    int max;
+    switch (current()) {
+      // QuantifierPrefix ::
+      //   *
+      //   +
+      //   ?
+      //   {
+      case '*':
+        min = 0;
+        max = RegExpTree::kInfinity;
+        Advance();
+        break;
+      case '+':
+        min = 1;
+        max = RegExpTree::kInfinity;
+        Advance();
+        break;
+      case '?':
+        min = 0;
+        max = 1;
+        Advance();
+        break;
+      case '{':
+        if (ParseIntervalQuantifier(&min, &max)) {
+          if (max < min) {
+            return ReportError(RegExpError::kRangeOutOfOrder);
+          }
+          break;
+        } else if (IsUnicodeMode()) {
+          // Incomplete quantifiers are not allowed.
+          return ReportError(RegExpError::kIncompleteQuantifier);
+        }
+        continue;
+      default:
+        continue;
+    }
+    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
+    if (current() == '?') {
+      quantifier_type = RegExpQuantifier::NON_GREEDY;
+      Advance();
+    } else if (v8_flags.regexp_possessive_quantifier && current() == '+') {
+      // v8_flags.regexp_possessive_quantifier is a debug-only flag.
+      quantifier_type = RegExpQuantifier::POSSESSIVE;
+      Advance();
+    }
+    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+      return ReportError(RegExpError::kInvalidQuantifier);
+    }
+  }
+}
+
+template <class CharT>
+RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
+    RegExpParserState* state) {
+  RegExpLookaround::Type lookaround_type = state->lookaround_type();
+  bool is_named_capture = false;
+  const ZoneVector<base::uc16>* capture_name = nullptr;
+  SubexpressionType subexpr_type = CAPTURE;
+  Advance();
+  if (current() == '?') {
+    switch (Next()) {
+      case ':':
+        Advance(2);
+        subexpr_type = GROUPING;
+        break;
+      case '=':
+        Advance(2);
+        lookaround_type = RegExpLookaround::LOOKAHEAD;
+        subexpr_type = POSITIVE_LOOKAROUND;
+        break;
+      case '!':
+        Advance(2);
+        lookaround_type = RegExpLookaround::LOOKAHEAD;
+        subexpr_type = NEGATIVE_LOOKAROUND;
+        break;
+      case '<':
+        Advance();
+        if (Next() == '=') {
+          Advance(2);
+          lookaround_type = RegExpLookaround::LOOKBEHIND;
+          subexpr_type = POSITIVE_LOOKAROUND;
+          break;
+        } else if (Next() == '!') {
+          Advance(2);
+          lookaround_type = RegExpLookaround::LOOKBEHIND;
+          subexpr_type = NEGATIVE_LOOKAROUND;
+          break;
+        }
+        is_named_capture = true;
+        has_named_captures_ = true;
+        Advance();
+        break;
+      default:
+        ReportError(RegExpError::kInvalidGroup);
+        return nullptr;
+    }
+  }
+  if (subexpr_type == CAPTURE) {
+    if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) {
+      ReportError(RegExpError::kTooManyCaptures);
+      return nullptr;
+    }
+    captures_started_++;
+
+    if (is_named_capture) {
+      capture_name = ParseCaptureGroupName(CHECK_FAILED);
+    }
+  }
+  // Store current state and begin new disjunction parsing.
+  return zone()->template New<RegExpParserState>(
+      state, subexpr_type, lookaround_type, captures_started_, capture_name,
+      state->builder()->flags(), zone());
+}
+
+#ifdef DEBUG
+namespace {
+
+bool IsSpecialClassEscape(base::uc32 c) {
+  switch (c) {
+    case 'd':
+    case 'D':
+    case 's':
+    case 'S':
+    case 'w':
+    case 'W':
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+#endif
+
+// In order to know whether an escape is a backreference or not we have to scan
+// the entire regexp and find the number of capturing parentheses.  However we
+// don't want to scan the regexp twice unless it is necessary.  This mini-parser
+// is called when needed.  It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and backslash-escaped
+// characters.
+//
+// Important: The scanner has to be in a consistent state when calling
+// ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while
+// parsing a nested class.
+template <class CharT>
+void RegExpParserImpl<CharT>::ScanForCaptures(
+    InClassEscapeState in_class_escape_state) {
+  DCHECK(!is_scanned_for_captures_);
+  const int saved_position = position();
+  // Start with captures started previous to current position
+  int capture_count = captures_started();
+  // When we start inside a character class, skip everything inside the class.
+  if (in_class_escape_state == InClassEscapeState::kInClass) {
+    // \k is always invalid within a class in unicode mode, thus we should never
+    // call ScanForCaptures within a class.
+    DCHECK(!IsUnicodeMode());
+    int c;
+    while ((c = current()) != kEndMarker) {
+      Advance();
+      if (c == '\\') {
+        Advance();
+      } else {
+        if (c == ']') break;
+      }
+    }
+  }
+  // Add count of captures after this position.
+  int n;
+  while ((n = current()) != kEndMarker) {
+    Advance();
+    switch (n) {
+      case '\\':
+        Advance();
+        break;
+      case '[': {
+        int class_nest_level = 0;
+        int c;
+        while ((c = current()) != kEndMarker) {
+          Advance();
+          if (c == '\\') {
+            Advance();
+          } else if (c == '[') {
+            // With /v, '[' inside a class is treated as a nested class.
+            // Without /v, '[' is a normal character.
+            if (unicode_sets()) class_nest_level++;
+          } else if (c == ']') {
+            if (class_nest_level == 0) break;
+            class_nest_level--;
+          }
+        }
+        break;
+      }
+      case '(':
+        if (current() == '?') {
+          // At this point we could be in
+          // * a non-capturing group '(:',
+          // * a lookbehind assertion '(?<=' '(?<!'
+          // * or a named capture '(?<'.
+          //
+          // Of these, only named captures are capturing groups.
+
+          Advance();
+          if (current() != '<') break;
+
+          Advance();
+          if (current() == '=' || current() == '!') break;
+
+          // Found a possible named capture. It could turn out to be a syntax
+          // error (e.g. an unterminated or invalid name), but that distinction
+          // does not matter for our purposes.
+          has_named_captures_ = true;
+        }
+        capture_count++;
+        break;
+    }
+  }
+  capture_count_ = capture_count;
+  is_scanned_for_captures_ = true;
+  Reset(saved_position);
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
+  DCHECK_EQ('\\', current());
+  DCHECK('1' <= Next() && Next() <= '9');
+  // Try to parse a decimal literal that is no greater than the total number
+  // of left capturing parentheses in the input.
+  int start = position();
+  int value = Next() - '0';
+  Advance(2);
+  while (true) {
+    base::uc32 c = current();
+    if (IsDecimalDigit(c)) {
+      value = 10 * value + (c - '0');
+      if (value > RegExpMacroAssembler::kMaxCaptures) {
+        Reset(start);
+        return false;
+      }
+      Advance();
+    } else {
+      break;
+    }
+  }
+  if (value > captures_started()) {
+    if (!is_scanned_for_captures_) {
+      ScanForCaptures(InClassEscapeState::kNotInClass);
+    }
+    if (value > capture_count_) {
+      Reset(start);
+      return false;
+    }
+  }
+  *index_out = value;
+  return true;
+}
+
+namespace {
+
+void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
+  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    v->push_back(code_unit);
+  } else {
+    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
+    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
+  }
+}
+
+}  // namespace
+
+template <class CharT>
+const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
+  // Due to special Advance requirements (see the next comment), rewind by one
+  // such that names starting with a surrogate pair are parsed correctly for
+  // patterns where the unicode flag is unset.
+  //
+  // Note that we use this odd pattern of rewinding the last advance in order
+  // to adhere to the common parser behavior of expecting `current` to point at
+  // the first candidate character for a function (e.g. when entering ParseFoo,
+  // `current` should point at the first character of Foo).
+  RewindByOneCodepoint();
+
+  ZoneVector<base::uc16>* name =
+      zone()->template New<ZoneVector<base::uc16>>(zone());
+
+  {
+    // Advance behavior inside this function is tricky since
+    // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
+    // and thus allows surrogate pairs and \u{}-style escapes even in
+    // non-unicode patterns. Therefore Advance within the capture group name
+    // has to force-enable unicode, and outside the name revert to default
+    // behavior.
+    ForceUnicodeScope force_unicode(this);
+
+    bool at_start = true;
+    while (true) {
+      Advance();
+      base::uc32 c = current();
+
+      // Convert unicode escapes.
+      if (c == '\\' && Next() == 'u') {
+        Advance(2);
+        if (!ParseUnicodeEscape(&c)) {
+          ReportError(RegExpError::kInvalidUnicodeEscape);
+          return nullptr;
+        }
+        RewindByOneCodepoint();
+      }
+
+      // The backslash char is misclassified as both ID_Start and ID_Continue.
+      if (c == '\\') {
+        ReportError(RegExpError::kInvalidCaptureGroupName);
+        return nullptr;
+      }
+
+      if (at_start) {
+        if (!IsIdentifierStart(c)) {
+          ReportError(RegExpError::kInvalidCaptureGroupName);
+          return nullptr;
+        }
+        push_code_unit(name, c);
+        at_start = false;
+      } else {
+        if (c == '>') {
+          break;
+        } else if (IsIdentifierPart(c)) {
+          push_code_unit(name, c);
+        } else {
+          ReportError(RegExpError::kInvalidCaptureGroupName);
+          return nullptr;
+        }
+      }
+    }
+  }
+
+  // This final advance goes back into the state of pointing at the next
+  // relevant char, which the rest of the parser expects. See also the previous
+  // comments in this function.
+  Advance();
+  return name;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
+    const ZoneVector<base::uc16>* name, int index) {
+  DCHECK(0 < index && index <= captures_started_);
+  DCHECK_NOT_NULL(name);
+
+  RegExpCapture* capture = GetCapture(index);
+  DCHECK_NULL(capture->name());
+
+  capture->set_name(name);
+
+  if (named_captures_ == nullptr) {
+    named_captures_ =
+        zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
+            zone());
+  } else {
+    // Check for duplicates and bail if we find any.
+
+    const auto& named_capture_it = named_captures_->find(capture);
+    if (named_capture_it != named_captures_->end()) {
+      ReportError(RegExpError::kDuplicateCaptureGroupName);
+      return false;
+    }
+  }
+
+  named_captures_->emplace(capture);
+
+  return true;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseNamedBackReference(
+    RegExpBuilder* builder, RegExpParserState* state) {
+  // The parser is assumed to be on the '<' in \k<name>.
+  if (current() != '<') {
+    ReportError(RegExpError::kInvalidNamedReference);
+    return false;
+  }
+
+  Advance();
+  const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
+  if (name == nullptr) {
+    return false;
+  }
+
+  if (state->IsInsideCaptureGroup(name)) {
+    builder->AddEmpty();
+  } else {
+    RegExpBackReference* atom =
+        zone()->template New<RegExpBackReference>(builder->flags());
+    atom->set_name(name);
+
+    builder->AddAtom(atom);
+
+    if (named_back_references_ == nullptr) {
+      named_back_references_ =
+          zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
+    }
+    named_back_references_->Add(atom, zone());
+  }
+
+  return true;
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
+  if (named_back_references_ == nullptr) return;
+
+  if (named_captures_ == nullptr) {
+    ReportError(RegExpError::kInvalidNamedCaptureReference);
+    return;
+  }
+
+  // Look up and patch the actual capture for each named back reference.
+
+  for (int i = 0; i < named_back_references_->length(); i++) {
+    RegExpBackReference* ref = named_back_references_->at(i);
+
+    // Capture used to search the named_captures_ by name, index of the
+    // capture is never used.
+    static const int kInvalidIndex = 0;
+    RegExpCapture* search_capture =
+        zone()->template New<RegExpCapture>(kInvalidIndex);
+    DCHECK_NULL(search_capture->name());
+    search_capture->set_name(ref->name());
+
+    int index = -1;
+    const auto& capture_it = named_captures_->find(search_capture);
+    if (capture_it != named_captures_->end()) {
+      index = (*capture_it)->index();
+    } else {
+      ReportError(RegExpError::kInvalidNamedCaptureReference);
+      return;
+    }
+
+    ref->set_capture(GetCapture(index));
+  }
+}
+
+template <class CharT>
+RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
+  // The index for the capture groups are one-based. Its index in the list is
+  // zero-based.
+  const int known_captures =
+      is_scanned_for_captures_ ? capture_count_ : captures_started_;
+  DCHECK(index <= known_captures);
+  if (captures_ == nullptr) {
+    captures_ =
+        zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
+  }
+  while (captures_->length() < known_captures) {
+    captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
+                   zone());
+  }
+  return captures_->at(index - 1);
+}
+
+template <class CharT>
+ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
+  if (named_captures_ == nullptr || named_captures_->empty()) {
+    return nullptr;
+  }
+
+  return zone()->template New<ZoneVector<RegExpCapture*>>(
+      named_captures_->begin(), named_captures_->end(), zone());
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::HasNamedCaptures(
+    InClassEscapeState in_class_escape_state) {
+  if (has_named_captures_ || is_scanned_for_captures_) {
+    return has_named_captures_;
+  }
+
+  ScanForCaptures(in_class_escape_state);
+  DCHECK(is_scanned_for_captures_);
+  return has_named_captures_;
+}
+
+// QuantifierPrefix ::
+//   { DecimalDigits }
+//   { DecimalDigits , }
+//   { DecimalDigits , DecimalDigits }
+//
+// Returns true if parsing succeeds, and set the min_out and max_out
+// values. Values are truncated to RegExpTree::kInfinity if they overflow.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
+                                                      int* max_out) {
+  DCHECK_EQ(current(), '{');
+  int start = position();
+  Advance();
+  int min = 0;
+  if (!IsDecimalDigit(current())) {
+    Reset(start);
+    return false;
+  }
+  while (IsDecimalDigit(current())) {
+    int next = current() - '0';
+    if (min > (RegExpTree::kInfinity - next) / 10) {
+      // Overflow. Skip past remaining decimal digits and return -1.
+      do {
+        Advance();
+      } while (IsDecimalDigit(current()));
+      min = RegExpTree::kInfinity;
+      break;
+    }
+    min = 10 * min + next;
+    Advance();
+  }
+  int max = 0;
+  if (current() == '}') {
+    max = min;
+    Advance();
+  } else if (current() == ',') {
+    Advance();
+    if (current() == '}') {
+      max = RegExpTree::kInfinity;
+      Advance();
+    } else {
+      while (IsDecimalDigit(current())) {
+        int next = current() - '0';
+        if (max > (RegExpTree::kInfinity - next) / 10) {
+          do {
+            Advance();
+          } while (IsDecimalDigit(current()));
+          max = RegExpTree::kInfinity;
+          break;
+        }
+        max = 10 * max + next;
+        Advance();
+      }
+      if (current() != '}') {
+        Reset(start);
+        return false;
+      }
+      Advance();
+    }
+  } else {
+    Reset(start);
+    return false;
+  }
+  *min_out = min;
+  *max_out = max;
+  return true;
+}
+
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
+  DCHECK(('0' <= current() && current() <= '7') || !has_more());
+  // For compatibility with some other browsers (not all), we parse
+  // up to three octal digits with a value below 256.
+  // ES#prod-annexB-LegacyOctalEscapeSequence
+  base::uc32 value = current() - '0';
+  Advance();
+  if ('0' <= current() && current() <= '7') {
+    value = value * 8 + current() - '0';
+    Advance();
+    if (value < 32 && '0' <= current() && current() <= '7') {
+      value = value * 8 + current() - '0';
+      Advance();
+    }
+  }
+  return value;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
+  int start = position();
+  base::uc32 val = 0;
+  for (int i = 0; i < length; ++i) {
+    base::uc32 c = current();
+    int d = base::HexValue(c);
+    if (d < 0) {
+      Reset(start);
+      return false;
+    }
+    val = val * 16 + d;
+    Advance();
+  }
+  *value = val;
+  return true;
+}
+
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
+  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
+  // allowed). In the latter case, the number of hex digits between { } is
+  // arbitrary. \ and u have already been read.
+  if (current() == '{' && IsUnicodeMode()) {
+    int start = position();
+    Advance();
+    if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
+      if (current() == '}') {
+        Advance();
+        return true;
+      }
+    }
+    Reset(start);
+    return false;
+  }
+  // \u but no {, or \u{...} escapes not allowed.
+  bool result = ParseHexEscape(4, value);
+  if (result && IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+      current() == '\\') {
+    // Attempt to read trail surrogate.
+    int start = position();
+    if (Next() == 'u') {
+      Advance(2);
+      base::uc32 trail;
+      if (ParseHexEscape(4, &trail) &&
+          unibrow::Utf16::IsTrailSurrogate(trail)) {
+        *value = unibrow::Utf16::CombineSurrogatePair(
+            static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
+        return true;
+      }
+    }
+    Reset(start);
+  }
+  return result;
+}
+
+#ifdef V8_INTL_SUPPORT
+
+namespace {
+
+bool IsExactPropertyAlias(const char* property_name, UProperty property) {
+  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
+  if (short_name != nullptr && strcmp(property_name, short_name) == 0)
+    return true;
+  for (int i = 0;; i++) {
+    const char* long_name = u_getPropertyName(
+        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
+    if (long_name == nullptr) break;
+    if (strcmp(property_name, long_name) == 0) return true;
+  }
+  return false;
+}
+
+bool IsExactPropertyValueAlias(const char* property_value_name,
+                               UProperty property, int32_t property_value) {
+  const char* short_name =
+      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
+  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
+    return true;
+  }
+  for (int i = 0;; i++) {
+    const char* long_name = u_getPropertyValueName(
+        property, property_value,
+        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
+    if (long_name == nullptr) break;
+    if (strcmp(property_value_name, long_name) == 0) return true;
+  }
+  return false;
+}
+
+void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set,
+                                  CharacterClassStrings* strings,
+                                  RegExpFlags flags, Zone* zone) {
+  DCHECK(set.hasStrings());
+  DCHECK(IsUnicodeSets(flags));
+  DCHECK_NOT_NULL(strings);
+
+  RegExpTextBuilder::SmallRegExpTreeVector string_storage(
+      ZoneAllocator<RegExpTree*>{zone});
+  RegExpTextBuilder string_builder(zone, &string_storage, flags);
+  const bool needs_case_folding = IsIgnoreCase(flags);
+  icu::UnicodeSetIterator iter(set);
+  iter.skipToStrings();
+  while (iter.next()) {
+    const icu::UnicodeString& s = iter.getString();
+    const char16_t* p = s.getBuffer();
+    int32_t length = s.length();
+    ZoneList<base::uc32>* string =
+        zone->template New<ZoneList<base::uc32>>(length, zone);
+    for (int32_t i = 0; i < length;) {
+      UChar32 c;
+      U16_NEXT(p, i, length, c);
+      string_builder.AddUnicodeCharacter(c);
+      if (needs_case_folding) {
+        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+      }
+      string->Add(c, zone);
+    }
+    strings->emplace(string->ToVector(), string_builder.ToRegExp());
+    string_storage.clear();
+  }
+}
+
+bool LookupPropertyValueName(UProperty property,
+                             const char* property_value_name, bool negate,
+                             ZoneList<CharacterRange>* result_ranges,
+                             CharacterClassStrings* result_strings,
+                             RegExpFlags flags, Zone* zone) {
+  UProperty property_for_lookup = property;
+  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
+    // For the property Script_Extensions, we have to do the property value
+    // name lookup as if the property is Script.
+    property_for_lookup = UCHAR_SCRIPT;
+  }
+  int32_t property_value =
+      u_getPropertyValueEnum(property_for_lookup, property_value_name);
+  if (property_value == UCHAR_INVALID_CODE) return false;
+
+  // We require the property name to match exactly to one of the property value
+  // aliases. However, u_getPropertyValueEnum uses loose matching.
+  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
+                                 property_value)) {
+    return false;
+  }
+
+  UErrorCode ec = U_ZERO_ERROR;
+  icu::UnicodeSet set;
+  set.applyIntPropertyValue(property, property_value, ec);
+  bool success = ec == U_ZERO_ERROR && !set.isEmpty();
+
+  if (success) {
+    if (set.hasStrings()) {
+      ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
+    }
+    const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
+    if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set);
+    set.removeAllStrings();
+    if (negate) set.complement();
+    for (int i = 0; i < set.getRangeCount(); i++) {
+      result_ranges->Add(
+          CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
+          zone);
+    }
+  }
+  return success;
+}
+
+template <size_t N>
+inline bool NameEquals(const char* name, const char (&literal)[N]) {
+  return strncmp(name, literal, N + 1) == 0;
+}
+
+bool LookupSpecialPropertyValueName(const char* name,
+                                    ZoneList<CharacterRange>* result,
+                                    bool negate, RegExpFlags flags,
+                                    Zone* zone) {
+  if (NameEquals(name, "Any")) {
+    if (negate) {
+      // Leave the list of character ranges empty, since the negation of 'Any'
+      // is the empty set.
+    } else {
+      result->Add(CharacterRange::Everything(), zone);
+    }
+  } else if (NameEquals(name, "ASCII")) {
+    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
+                       : CharacterRange::Range(0x0, 0x7F),
+                zone);
+  } else if (NameEquals(name, "Assigned")) {
+    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
+                                   !negate, result, nullptr, flags, zone);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+// Explicitly allowlist supported binary properties. The spec forbids supporting
+// properties outside of this set to ensure interoperability.
+bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) {
+  switch (property) {
+    case UCHAR_ALPHABETIC:
+    // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
+    // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
+    case UCHAR_ASCII_HEX_DIGIT:
+    // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
+    case UCHAR_BIDI_CONTROL:
+    case UCHAR_BIDI_MIRRORED:
+    case UCHAR_CASE_IGNORABLE:
+    case UCHAR_CASED:
+    case UCHAR_CHANGES_WHEN_CASEFOLDED:
+    case UCHAR_CHANGES_WHEN_CASEMAPPED:
+    case UCHAR_CHANGES_WHEN_LOWERCASED:
+    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
+    case UCHAR_CHANGES_WHEN_TITLECASED:
+    case UCHAR_CHANGES_WHEN_UPPERCASED:
+    case UCHAR_DASH:
+    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
+    case UCHAR_DEPRECATED:
+    case UCHAR_DIACRITIC:
+    case UCHAR_EMOJI:
+    case UCHAR_EMOJI_COMPONENT:
+    case UCHAR_EMOJI_MODIFIER_BASE:
+    case UCHAR_EMOJI_MODIFIER:
+    case UCHAR_EMOJI_PRESENTATION:
+    case UCHAR_EXTENDED_PICTOGRAPHIC:
+    case UCHAR_EXTENDER:
+    case UCHAR_GRAPHEME_BASE:
+    case UCHAR_GRAPHEME_EXTEND:
+    case UCHAR_HEX_DIGIT:
+    case UCHAR_ID_CONTINUE:
+    case UCHAR_ID_START:
+    case UCHAR_IDEOGRAPHIC:
+    case UCHAR_IDS_BINARY_OPERATOR:
+    case UCHAR_IDS_TRINARY_OPERATOR:
+    case UCHAR_JOIN_CONTROL:
+    case UCHAR_LOGICAL_ORDER_EXCEPTION:
+    case UCHAR_LOWERCASE:
+    case UCHAR_MATH:
+    case UCHAR_NONCHARACTER_CODE_POINT:
+    case UCHAR_PATTERN_SYNTAX:
+    case UCHAR_PATTERN_WHITE_SPACE:
+    case UCHAR_QUOTATION_MARK:
+    case UCHAR_RADICAL:
+    case UCHAR_REGIONAL_INDICATOR:
+    case UCHAR_S_TERM:
+    case UCHAR_SOFT_DOTTED:
+    case UCHAR_TERMINAL_PUNCTUATION:
+    case UCHAR_UNIFIED_IDEOGRAPH:
+    case UCHAR_UPPERCASE:
+    case UCHAR_VARIATION_SELECTOR:
+    case UCHAR_WHITE_SPACE:
+    case UCHAR_XID_CONTINUE:
+    case UCHAR_XID_START:
+      return true;
+    case UCHAR_BASIC_EMOJI:
+    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
+    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
+    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
+    case UCHAR_RGI_EMOJI:
+      return unicode_sets;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool IsBinaryPropertyOfStrings(UProperty property) {
+  switch (property) {
+    case UCHAR_BASIC_EMOJI:
+    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
+    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
+    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
+    case UCHAR_RGI_EMOJI:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool IsUnicodePropertyValueCharacter(char c) {
+  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
+  //
+  // Note that using this to validate each parsed char is quite conservative.
+  // A possible alternative solution would be to only ensure the parsed
+  // property name/value candidate string does not contain '\0' characters and
+  // let ICU lookups trigger the final failure.
+  if ('a' <= c && c <= 'z') return true;
+  if ('A' <= c && c <= 'Z') return true;
+  if ('0' <= c && c <= '9') return true;
+  return (c == '_');
+}
+
+}  // namespace
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
+                                                     ZoneVector<char>* name_2) {
+  DCHECK(name_1->empty());
+  DCHECK(name_2->empty());
+  // Parse the property class as follows:
+  // - In \p{name}, 'name' is interpreted
+  //   - either as a general category property value name.
+  //   - or as a binary property name.
+  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
+  //   and 'value' is interpreted as one of the available property value names.
+  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
+  // - Loose matching is not applied.
+  if (current() == '{') {
+    // Parse \p{[PropertyName=]PropertyNameValue}
+    for (Advance(); current() != '}' && current() != '='; Advance()) {
+      if (!IsUnicodePropertyValueCharacter(current())) return false;
+      if (!has_next()) return false;
+      name_1->push_back(static_cast<char>(current()));
+    }
+    if (current() == '=') {
+      for (Advance(); current() != '}'; Advance()) {
+        if (!IsUnicodePropertyValueCharacter(current())) return false;
+        if (!has_next()) return false;
+        name_2->push_back(static_cast<char>(current()));
+      }
+      name_2->push_back(0);  // null-terminate string.
+    }
+  } else {
+    return false;
+  }
+  Advance();
+  name_1->push_back(0);  // null-terminate string.
+
+  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
+  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
+  return true;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::AddPropertyClassRange(
+    ZoneList<CharacterRange>* add_to_ranges,
+    CharacterClassStrings* add_to_strings, bool negate,
+    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
+  if (name_2.empty()) {
+    // First attempt to interpret as general category property value name.
+    const char* name = name_1.data();
+    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
+                                add_to_ranges, add_to_strings, flags(),
+                                zone())) {
+      return true;
+    }
+    // Interpret "Any", "ASCII", and "Assigned".
+    if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(),
+                                       zone())) {
+      return true;
+    }
+    // Then attempt to interpret as binary property name with value name 'Y'.
+    UProperty property = u_getPropertyEnum(name);
+    if (!IsSupportedBinaryProperty(property, unicode_sets())) return false;
+    if (!IsExactPropertyAlias(name, property)) return false;
+    // Negation of properties with strings is not allowed.
+    // TODO(v8:11935): Change permalink once proposal is in stage 4.
+    // See
+    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-static-semantics-maycontainstrings
+    if (negate && IsBinaryPropertyOfStrings(property)) return false;
+    return LookupPropertyValueName(property, negate ? "N" : "Y", false,
+                                   add_to_ranges, add_to_strings, flags(),
+                                   zone());
+  } else {
+    // Both property name and value name are specified. Attempt to interpret
+    // the property name as enumerated property.
+    const char* property_name = name_1.data();
+    const char* value_name = name_2.data();
+    UProperty property = u_getPropertyEnum(property_name);
+    if (!IsExactPropertyAlias(property_name, property)) return false;
+    if (property == UCHAR_GENERAL_CATEGORY) {
+      // We want to allow aggregate value names such as "Letter".
+      property = UCHAR_GENERAL_CATEGORY_MASK;
+    } else if (property != UCHAR_SCRIPT &&
+               property != UCHAR_SCRIPT_EXTENSIONS) {
+      return false;
+    }
+    return LookupPropertyValueName(property, value_name, negate, add_to_ranges,
+                                   add_to_strings, flags(), zone());
+  }
+}
+
+#else  // V8_INTL_SUPPORT
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
+                                                     ZoneVector<char>* name_2) {
+  return false;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::AddPropertyClassRange(
+    ZoneList<CharacterRange>* add_to_ranges,
+    CharacterClassStrings* add_to_strings, bool negate,
+    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
+  return false;
+}
+
+#endif  // V8_INTL_SUPPORT
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
+                                                            base::uc32* value) {
+  base::uc32 x = 0;
+  int d = base::HexValue(current());
+  if (d < 0) {
+    return false;
+  }
+  while (d >= 0) {
+    x = x * 16 + d;
+    if (x > static_cast<base::uc32>(max_value)) {
+      return false;
+    }
+    Advance();
+    d = base::HexValue(current());
+  }
+  *value = x;
+  return true;
+}
+
+// https://tc39.es/ecma262/#prod-CharacterEscape
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
+    InClassEscapeState in_class_escape_state,
+    bool* is_escaped_unicode_character) {
+  DCHECK_EQ('\\', current());
+  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
+
+  Advance();
+
+  const base::uc32 c = current();
+  switch (c) {
+    // CharacterEscape ::
+    //   ControlEscape :: one of
+    //     f n r t v
+    case 'f':
+      Advance();
+      return '\f';
+    case 'n':
+      Advance();
+      return '\n';
+    case 'r':
+      Advance();
+      return '\r';
+    case 't':
+      Advance();
+      return '\t';
+    case 'v':
+      Advance();
+      return '\v';
+    // CharacterEscape ::
+    //   c ControlLetter
+    case 'c': {
+      base::uc32 controlLetter = Next();
+      base::uc32 letter = controlLetter & ~('A' ^ 'a');
+      if (letter >= 'A' && letter <= 'Z') {
+        Advance(2);
+        // Control letters mapped to ASCII control characters in the range
+        // 0x00-0x1F.
+        return controlLetter & 0x1F;
+      }
+      if (IsUnicodeMode()) {
+        // With /u and /v, invalid escapes are not treated as identity escapes.
+        ReportError(RegExpError::kInvalidUnicodeEscape);
+        return 0;
+      }
+      if (in_class_escape_state == InClassEscapeState::kInClass) {
+        // Inside a character class, we also accept digits and underscore as
+        // control characters, unless with /u or /v. See Annex B:
+        // ES#prod-annexB-ClassControlLetter
+        if ((controlLetter >= '0' && controlLetter <= '9') ||
+            controlLetter == '_') {
+          Advance(2);
+          return controlLetter & 0x1F;
+        }
+      }
+      // We match JSC in reading the backslash as a literal
+      // character instead of as starting an escape.
+      return '\\';
+    }
+    // CharacterEscape ::
+    //   0 [lookahead ∉ DecimalDigit]
+    //   [~UnicodeMode] LegacyOctalEscapeSequence
+    case '0':
+      // \0 is interpreted as NUL if not followed by another digit.
+      if (Next() < '0' || Next() > '9') {
+        Advance();
+        return 0;
+      }
+      V8_FALLTHROUGH;
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+      // For compatibility, we interpret a decimal escape that isn't
+      // a back reference (and therefore either \0 or not valid according
+      // to the specification) as a 1..3 digit octal character code.
+      // ES#prod-annexB-LegacyOctalEscapeSequence
+      if (IsUnicodeMode()) {
+        // With /u or /v, decimal escape is not interpreted as octal character
+        // code.
+        ReportError(RegExpError::kInvalidClassEscape);
+        return 0;
+      }
+      return ParseOctalLiteral();
+    // CharacterEscape ::
+    //   HexEscapeSequence
+    case 'x': {
+      Advance();
+      base::uc32 value;
+      if (ParseHexEscape(2, &value)) return value;
+      if (IsUnicodeMode()) {
+        // With /u or /v, invalid escapes are not treated as identity escapes.
+        ReportError(RegExpError::kInvalidEscape);
+        return 0;
+      }
+      // If \x is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'x';
+    }
+    // CharacterEscape ::
+    //   RegExpUnicodeEscapeSequence [?UnicodeMode]
+    case 'u': {
+      Advance();
+      base::uc32 value;
+      if (ParseUnicodeEscape(&value)) {
+        *is_escaped_unicode_character = true;
+        return value;
+      }
+      if (IsUnicodeMode()) {
+        // With /u or /v, invalid escapes are not treated as identity escapes.
+        ReportError(RegExpError::kInvalidUnicodeEscape);
+        return 0;
+      }
+      // If \u is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'u';
+    }
+    default:
+      break;
+  }
+
+  // CharacterEscape ::
+  //   IdentityEscape[?UnicodeMode, ?N]
+  //
+  // * With /u, no identity escapes except for syntax characters are
+  //   allowed.
+  // * With /v, no identity escapes except for syntax characters and
+  //   ClassSetReservedPunctuators (if within a class) are allowed.
+  // * Without /u or /v:
+  //   * '\c' is not an IdentityEscape.
+  //   * '\k' is not an IdentityEscape when named captures exist.
+  //   * Otherwise, all identity escapes are allowed.
+  if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) {
+    if (IsClassSetReservedPunctuator(c)) {
+      Advance();
+      return c;
+    }
+  }
+  if (IsUnicodeMode()) {
+    if (!IsSyntaxCharacterOrSlash(c)) {
+      ReportError(RegExpError::kInvalidEscape);
+      return 0;
+    }
+    Advance();
+    return c;
+  }
+  DCHECK(!IsUnicodeMode());
+  if (c == 'c') {
+    ReportError(RegExpError::kInvalidEscape);
+    return 0;
+  }
+  Advance();
+  // Note: It's important to Advance before the HasNamedCaptures call s.t. we
+  // don't start scanning in the middle of an escape.
+  if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
+    ReportError(RegExpError::kInvalidEscape);
+    return 0;
+  }
+  return c;
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassRanges
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(
+    ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) {
+  base::uc32 char_1, char_2;
+  bool is_class_1, is_class_2;
+  while (has_more() && current() != ']') {
+    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
+                     &is_class_1 CHECK_FAILED);
+    // ClassAtom
+    if (current() == '-') {
+      Advance();
+      if (!has_more()) {
+        // If we reach the end we break out of the loop and let the
+        // following code report an error.
+        break;
+      } else if (current() == ']') {
+        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
+        ranges->Add(CharacterRange::Singleton('-'), zone());
+        break;
+      }
+      ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
+                       &is_class_2 CHECK_FAILED);
+      if (is_class_1 || is_class_2) {
+        // Either end is an escaped character class. Treat the '-' verbatim.
+        if (IsUnicodeMode()) {
+          // ES2015 21.2.2.15.1 step 1.
+          return ReportError(RegExpError::kInvalidCharacterClass);
+        }
+        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
+        ranges->Add(CharacterRange::Singleton('-'), zone());
+        if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
+        continue;
+      }
+      // ES2015 21.2.2.15.1 step 6.
+      if (char_1 > char_2) {
+        return ReportError(RegExpError::kOutOfOrderCharacterClass);
+      }
+      ranges->Add(CharacterRange::Range(char_1, char_2), zone());
+    } else {
+      if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
+    }
+  }
+  return nullptr;
+}
+
+// https://tc39.es/ecma262/#prod-ClassEscape
+template <class CharT>
+void RegExpParserImpl<CharT>::ParseClassEscape(
+    ZoneList<CharacterRange>* ranges, Zone* zone,
+    bool add_unicode_case_equivalents, base::uc32* char_out,
+    bool* is_class_escape) {
+  *is_class_escape = false;
+
+  if (current() != '\\') {
+    // Not a ClassEscape.
+    *char_out = current();
+    Advance();
+    return;
+  }
+
+  const base::uc32 next = Next();
+  switch (next) {
+    case 'b':
+      *char_out = '\b';
+      Advance(2);
+      return;
+    case '-':
+      if (IsUnicodeMode()) {
+        *char_out = next;
+        Advance(2);
+        return;
+      }
+      break;
+    case kEndMarker:
+      ReportError(RegExpError::kEscapeAtEndOfPattern);
+      return;
+    default:
+      break;
+  }
+
+  static constexpr InClassEscapeState kInClassEscape =
+      InClassEscapeState::kInClass;
+  *is_class_escape =
+      TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone,
+                                   add_unicode_case_equivalents);
+  if (*is_class_escape) return;
+
+  bool dummy = false;  // Unused.
+  *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
+}
+
+// https://tc39.es/ecma262/#prod-CharacterClassEscape
+template <class CharT>
+bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
+    base::uc32 next, InClassEscapeState in_class_escape_state,
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
+    Zone* zone, bool add_unicode_case_equivalents) {
+  DCHECK_EQ(current(), '\\');
+  DCHECK_EQ(Next(), next);
+
+  switch (next) {
+    case 'd':
+    case 'D':
+    case 's':
+    case 'S':
+    case 'w':
+    case 'W':
+      CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
+                                     ranges, add_unicode_case_equivalents,
+                                     zone);
+      Advance(2);
+      return true;
+    case 'p':
+    case 'P': {
+      if (!IsUnicodeMode()) return false;
+      bool negate = next == 'P';
+      Advance(2);
+      ZoneVector<char> name_1(zone);
+      ZoneVector<char> name_2(zone);
+      if (!ParsePropertyClassName(&name_1, &name_2) ||
+          !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) {
+        ReportError(in_class_escape_state == InClassEscapeState::kInClass
+                        ? RegExpError::kInvalidClassPropertyName
+                        : RegExpError::kInvalidPropertyName);
+      }
+      return true;
+    }
+    default:
+      return false;
+  }
+}
+
+namespace {
+
+// Add |string| to |ranges| if length of |string| == 1, otherwise add |string|
+// to |strings|.
+void AddClassString(ZoneList<base::uc32>* normalized_string,
+                    RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,
+                    CharacterClassStrings* strings, Zone* zone) {
+  if (normalized_string->length() == 1) {
+    ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);
+  } else {
+    strings->emplace(normalized_string->ToVector(), regexp_string);
+  }
+}
+
+}  // namespace
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassStringDisjunction
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  DCHECK_EQ(current(), '\\');
+  DCHECK_EQ(Next(), 'q');
+  Advance(2);
+  if (current() != '{') {
+    // Identity escape of 'q' is not allowed in unicode mode.
+    return ReportError(RegExpError::kInvalidEscape);
+  }
+  Advance();
+
+  ZoneList<base::uc32>* string =
+      zone()->template New<ZoneList<base::uc32>>(4, zone());
+  RegExpTextBuilder::SmallRegExpTreeVector string_storage(
+      ZoneAllocator<RegExpTree*>{zone()});
+  RegExpTextBuilder string_builder(zone(), &string_storage, flags());
+
+  while (has_more() && current() != '}') {
+    if (current() == '|') {
+      AddClassString(string, string_builder.ToRegExp(), ranges, strings,
+                     zone());
+      string = zone()->template New<ZoneList<base::uc32>>(4, zone());
+      string_storage.clear();
+      Advance();
+    } else {
+      base::uc32 c = ParseClassSetCharacter(CHECK_FAILED);
+      if (ignore_case()) {
+#ifdef V8_INTL_SUPPORT
+        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+#else
+        c = AsciiAlphaToLower(c);
+#endif
+      }
+      string->Add(c, zone());
+      string_builder.AddUnicodeCharacter(c);
+    }
+  }
+
+  AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone());
+
+  // We don't need to handle missing closing '}' here.
+  // If the character class is correctly closed, ParseClassSetCharacter will
+  // report an error.
+  DCHECK_EQ(current(), '}');
+  Advance();
+  return nullptr;
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// Tree returned based on type_out:
+//  * kNestedClass: RegExpClassSetExpression
+//  * For all other types: RegExpClassSetOperand
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
+    const RegExpBuilder* builder, ClassSetOperandType* type_out) {
+  ZoneList<CharacterRange>* ranges =
+      zone()->template New<ZoneList<CharacterRange>>(1, zone());
+  CharacterClassStrings* strings =
+      zone()->template New<CharacterClassStrings>(zone());
+  RegExpTree* tree =
+      ParseClassSetOperand(builder, type_out, ranges, strings CHECK_FAILED);
+  DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
+                 tree == nullptr);
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
+                 ranges->length() == 1);
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
+                 strings->empty());
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+                 ranges->is_empty());
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+                 strings->empty());
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+                 tree->IsClassSetExpression());
+  // ClassSetRange is only used within ClassSetUnion().
+  DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
+  // There are no restrictions for kCharacterClassEscape.
+  // CharacterClassEscape includes \p{}, which can contain ranges, strings or
+  // both and \P{}, which could contain nothing (i.e. \P{Any}).
+  if (tree == nullptr) {
+    tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
+  }
+  return tree;
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// Based on |type_out| either a tree is returned or ranges/strings modified.
+// If a tree is returned, ranges/strings are not modified.
+// If |type_out| is kNestedClass, a tree of type RegExpClassSetExpression is
+// returned. For all other types, ranges is modified and nullptr is returned.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
+    const RegExpBuilder* builder, ClassSetOperandType* type_out,
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  base::uc32 c = current();
+  if (c == '\\') {
+    const base::uc32 next = Next();
+    if (next == 'q') {
+      *type_out = ClassSetOperandType::kClassStringDisjunction;
+      ParseClassStringDisjunction(ranges, strings CHECK_FAILED);
+      return nullptr;
+    }
+    static constexpr InClassEscapeState kInClassEscape =
+        InClassEscapeState::kInClass;
+    const bool add_unicode_case_equivalents = ignore_case();
+    if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings,
+                                     zone(), add_unicode_case_equivalents)) {
+      *type_out = ClassSetOperandType::kCharacterClassEscape;
+      return nullptr;
+    }
+  }
+
+  if (c == '[') {
+    *type_out = ClassSetOperandType::kNestedClass;
+    return ParseCharacterClass(builder);
+  }
+
+  *type_out = ClassSetOperandType::kClassSetCharacter;
+  c = ParseClassSetCharacter(CHECK_FAILED);
+  ranges->Add(CharacterRange::Singleton(c), zone());
+  return nullptr;
+}
+
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
+  DCHECK(unicode_sets());
+  const base::uc32 c = current();
+  if (c == '\\') {
+    const base::uc32 next = Next();
+    switch (next) {
+      case 'b':
+        Advance(2);
+        return '\b';
+      case kEndMarker:
+        ReportError(RegExpError::kEscapeAtEndOfPattern);
+        return 0;
+    }
+    static constexpr InClassEscapeState kInClassEscape =
+        InClassEscapeState::kInClass;
+
+    bool dummy = false;  // Unused.
+    return ParseCharacterEscape(kInClassEscape, &dummy);
+  }
+  if (IsClassSetSyntaxCharacter(c)) {
+    ReportError(RegExpError::kInvalidCharacterInClass);
+    return 0;
+  }
+  if (IsClassSetReservedDoublePunctuator(c)) {
+    ReportError(RegExpError::kInvalidClassSetOperation);
+    return 0;
+  }
+  Advance();
+  return c;
+}
+
+namespace {
+
+bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
+  switch (type) {
+    case ClassSetOperandType::kClassSetCharacter:
+    case ClassSetOperandType::kClassSetRange:
+      return false;
+    case ClassSetOperandType::kCharacterClassEscape:
+    case ClassSetOperandType::kClassStringDisjunction:
+      return operand->AsClassSetOperand()->has_strings();
+    case ClassSetOperandType::kNestedClass:
+      if (operand->IsClassRanges()) return false;
+      return operand->AsClassSetExpression()->may_contain_strings();
+  }
+}
+
+}  // namespace
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassUnion
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
+    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+    ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
+    CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  ZoneList<RegExpTree*>* operands =
+      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+  bool may_contain_strings = false;
+  // Add the lhs to operands if necessary.
+  // Either the lhs values were added to |ranges|/|strings| (in which case
+  // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed
+  // as |first_operand| (in which case |ranges| and |strings| are empty).
+  if (first_operand != nullptr) {
+    may_contain_strings = MayContainStrings(first_operand_type, first_operand);
+    operands->Add(first_operand, zone());
+  }
+  ClassSetOperandType last_type = first_operand_type;
+  const bool needs_case_folding = ignore_case();
+  while (has_more() && current() != ']') {
+    if (current() == '-') {
+      // Mix of ClassSetRange and ClassSubtraction is not allowed.
+      if (Next() == '-') {
+        return ReportError(RegExpError::kInvalidClassSetOperation);
+      }
+      Advance();
+      if (!has_more()) {
+        // If we reach the end we break out of the loop and let the
+        // following code report an error.
+        break;
+      }
+      // If the lhs and rhs around '-' are both ClassSetCharacters, they
+      // represent a character range.
+      // In case one of them is not a ClassSetCharacter, it is a syntax error,
+      // as '-' can not be used unescaped within a class with /v.
+      // TODO(v8:11935): Change permalink once proposal is in stage 4.
+      // See
+      // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetRange
+      if (last_type != ClassSetOperandType::kClassSetCharacter) {
+        return ReportError(RegExpError::kInvalidCharacterClass);
+      }
+      ParseClassSetOperand(builder, &last_type, ranges, strings CHECK_FAILED);
+      if (last_type != ClassSetOperandType::kClassSetCharacter) {
+        return ReportError(RegExpError::kInvalidCharacterClass);
+      }
+      // Remove the last two singleton characters added to ranges, and combine
+      // them into a range.
+      auto rhs_ranges = ranges->RemoveLast();
+      auto lhs_ranges = ranges->RemoveLast();
+      DCHECK(lhs_ranges.IsSingleton());
+      DCHECK(rhs_ranges.IsSingleton());
+      base::uc32 from = lhs_ranges.from();
+      base::uc32 to = rhs_ranges.from();
+      if (from > to) {
+        return ReportError(RegExpError::kOutOfOrderCharacterClass);
+      }
+      ranges->Add(CharacterRange::Range(from, to), zone());
+      last_type = ClassSetOperandType::kClassSetRange;
+    } else {
+      DCHECK_NE(current(), '-');
+      RegExpTree* operand = ParseClassSetOperand(builder, &last_type, ranges,
+                                                 strings CHECK_FAILED);
+      if (operand != nullptr) {
+        may_contain_strings |= MayContainStrings(last_type, operand);
+        // Add the range we started building as operand and reset the current
+        // range.
+        if (!ranges->is_empty() || !strings->empty()) {
+          if (needs_case_folding) {
+            CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
+          }
+          may_contain_strings |= !strings->empty();
+          operands->Add(
+              zone()->template New<RegExpClassSetOperand>(ranges, strings),
+              zone());
+          ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
+          strings = zone()->template New<CharacterClassStrings>(zone());
+        }
+        operands->Add(operand, zone());
+      }
+    }
+  }
+
+  if (!has_more()) {
+    return ReportError(RegExpError::kUnterminatedCharacterClass);
+  }
+
+  // Add the range we started building as operand.
+  if (!ranges->is_empty() || !strings->empty()) {
+    if (needs_case_folding) {
+      CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
+    }
+    may_contain_strings |= !strings->empty();
+    operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
+                  zone());
+  }
+
+  DCHECK_EQ(current(), ']');
+  Advance();
+
+  if (is_negated && may_contain_strings) {
+    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+  }
+
+  return zone()->template New<RegExpClassSetExpression>(
+      RegExpClassSetExpression::OperationType::kUnion, is_negated,
+      may_contain_strings, operands);
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassIntersection
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
+    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+    ClassSetOperandType first_operand_type) {
+  DCHECK(unicode_sets());
+  DCHECK(current() == '&' && Next() == '&');
+  bool may_contain_strings =
+      MayContainStrings(first_operand_type, first_operand);
+  ZoneList<RegExpTree*>* operands =
+      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+  operands->Add(first_operand, zone());
+  while (has_more() && current() != ']') {
+    if (current() != '&' || Next() != '&') {
+      return ReportError(RegExpError::kInvalidClassSetOperation);
+    }
+    Advance(2);
+    // [lookahead ≠ &]
+    if (current() == '&') {
+      return ReportError(RegExpError::kInvalidCharacterInClass);
+    }
+
+    ClassSetOperandType operand_type;
+    RegExpTree* operand =
+        ParseClassSetOperand(builder, &operand_type CHECK_FAILED);
+    may_contain_strings &= MayContainStrings(operand_type, operand);
+    operands->Add(operand, zone());
+  }
+  if (!has_more()) {
+    return ReportError(RegExpError::kUnterminatedCharacterClass);
+  }
+  if (is_negated && may_contain_strings) {
+    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+  }
+  DCHECK_EQ(current(), ']');
+  Advance();
+  return zone()->template New<RegExpClassSetExpression>(
+      RegExpClassSetExpression::OperationType::kIntersection, is_negated,
+      may_contain_strings, operands);
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSubtraction
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
+    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+    ClassSetOperandType first_operand_type) {
+  DCHECK(unicode_sets());
+  DCHECK(current() == '-' && Next() == '-');
+  const bool may_contain_strings =
+      MayContainStrings(first_operand_type, first_operand);
+  if (is_negated && may_contain_strings) {
+    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+  }
+  ZoneList<RegExpTree*>* operands =
+      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+  operands->Add(first_operand, zone());
+  while (has_more() && current() != ']') {
+    if (current() != '-' || Next() != '-') {
+      return ReportError(RegExpError::kInvalidClassSetOperation);
+    }
+    Advance(2);
+    ClassSetOperandType dummy;  // unused
+    RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED);
+    operands->Add(operand, zone());
+  }
+  if (!has_more()) {
+    return ReportError(RegExpError::kUnterminatedCharacterClass);
+  }
+  DCHECK_EQ(current(), ']');
+  Advance();
+  return zone()->template New<RegExpClassSetExpression>(
+      RegExpClassSetExpression::OperationType::kSubtraction, is_negated,
+      may_contain_strings, operands);
+}
+
+// https://tc39.es/ecma262/#prod-CharacterClass
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
+    const RegExpBuilder* builder) {
+  DCHECK_EQ(current(), '[');
+  Advance();
+  bool is_negated = false;
+  if (current() == '^') {
+    is_negated = true;
+    Advance();
+  }
+  ZoneList<CharacterRange>* ranges =
+      zone()->template New<ZoneList<CharacterRange>>(2, zone());
+  if (current() == ']') {
+    Advance();
+    RegExpClassRanges::ClassRangesFlags class_ranges_flags;
+    if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;
+    return zone()->template New<RegExpClassRanges>(zone(), ranges,
+                                                   class_ranges_flags);
+  }
+
+  if (!unicode_sets()) {
+    bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case();
+    ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED);
+    if (!has_more()) {
+      return ReportError(RegExpError::kUnterminatedCharacterClass);
+    }
+    DCHECK_EQ(current(), ']');
+    Advance();
+    RegExpClassRanges::ClassRangesFlags character_class_flags;
+    if (is_negated) character_class_flags = RegExpClassRanges::NEGATED;
+    return zone()->template New<RegExpClassRanges>(zone(), ranges,
+                                                   character_class_flags);
+  } else {
+    ClassSetOperandType operand_type;
+    CharacterClassStrings* strings =
+        zone()->template New<CharacterClassStrings>(zone());
+    RegExpTree* operand = ParseClassSetOperand(builder, &operand_type, ranges,
+                                               strings CHECK_FAILED);
+    switch (current()) {
+      case '-':
+        if (Next() == '-') {
+          if (operand == nullptr) {
+            operand =
+                zone()->template New<RegExpClassSetOperand>(ranges, strings);
+          }
+          return ParseClassSubtraction(builder, is_negated, operand,
+                                       operand_type);
+        }
+        // ClassSetRange is handled in ParseClassUnion().
+        break;
+      case '&':
+        if (Next() == '&') {
+          if (operand == nullptr) {
+            operand =
+                zone()->template New<RegExpClassSetOperand>(ranges, strings);
+          }
+          return ParseClassIntersection(builder, is_negated, operand,
+                                        operand_type);
+        }
+    }
+    return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
+                           strings);
+  }
+}
+
+#undef CHECK_FAILED
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
+  DCHECK_NOT_NULL(result);
+  RegExpTree* tree = ParsePattern();
+
+  if (failed()) {
+    DCHECK_NULL(tree);
+    DCHECK_NE(error_, RegExpError::kNone);
+    result->error = error_;
+    result->error_pos = error_pos_;
+    return false;
+  }
+
+  DCHECK_NOT_NULL(tree);
+  DCHECK_EQ(error_, RegExpError::kNone);
+  if (v8_flags.trace_regexp_parser) {
+    StdoutStream os;
+    tree->Print(os, zone());
+    os << "\n";
+  }
+
+  result->tree = tree;
+  const int capture_count = captures_started();
+  result->simple = tree->IsAtom() && simple() && capture_count == 0;
+  result->contains_anchor = contains_anchor();
+  result->capture_count = capture_count;
+  result->named_captures = GetNamedCaptures();
+  return true;
+}
+
+void RegExpBuilder::FlushText() { text_builder().FlushText(); }
+
+void RegExpBuilder::AddCharacter(base::uc16 c) {
+  pending_empty_ = false;
+  text_builder().AddCharacter(c);
+}
+
+void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
+  pending_empty_ = false;
+  text_builder().AddUnicodeCharacter(c);
+}
+
+void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
+  pending_empty_ = false;
+  text_builder().AddEscapedUnicodeCharacter(character);
+}
+
+void RegExpBuilder::AddEmpty() {
+  text_builder().FlushPendingSurrogate();
+  pending_empty_ = true;
+}
+
+void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) {
+  pending_empty_ = false;
+  text_builder().AddClassRanges(cc);
+}
+
+void RegExpBuilder::AddAtom(RegExpTree* term) {
+  if (term->IsEmpty()) {
+    AddEmpty();
+    return;
+  }
+  pending_empty_ = false;
+  if (term->IsTextElement()) {
+    text_builder().AddAtom(term);
+  } else {
+    FlushText();
+    terms_.emplace_back(term);
+  }
+}
+
+void RegExpBuilder::AddTerm(RegExpTree* term) {
+  DCHECK(!term->IsEmpty());
+  pending_empty_ = false;
+  if (term->IsTextElement()) {
+    text_builder().AddTerm(term);
+  } else {
+    FlushText();
+    terms_.emplace_back(term);
+  }
+}
+
+void RegExpBuilder::AddAssertion(RegExpTree* assert) {
+  FlushText();
+  pending_empty_ = false;
+  terms_.emplace_back(assert);
+}
+
+void RegExpBuilder::NewAlternative() { FlushTerms(); }
+
+void RegExpBuilder::FlushTerms() {
+  FlushText();
+  size_t num_terms = terms_.size();
+  RegExpTree* alternative;
+  if (num_terms == 0) {
+    alternative = zone()->New<RegExpEmpty>();
+  } else if (num_terms == 1) {
+    alternative = terms_.back();
+  } else {
+    alternative =
+        zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
+            base::VectorOf(terms_.begin(), terms_.size()), zone()));
+  }
+  alternatives_.emplace_back(alternative);
+  terms_.clear();
+}
+
+RegExpTree* RegExpBuilder::ToRegExp() {
+  FlushTerms();
+  size_t num_alternatives = alternatives_.size();
+  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
+  if (num_alternatives == 1) return alternatives_.back();
+  return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
+      base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
+}
+
+bool RegExpBuilder::AddQuantifierToAtom(
+    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+  if (pending_empty_) {
+    pending_empty_ = false;
+    return true;
+  }
+  RegExpTree* atom = text_builder().PopLastAtom();
+  if (atom != nullptr) {
+    FlushText();
+  } else if (terms_.size() > 0) {
+    atom = terms_.back();
+    terms_.pop_back();
+    if (atom->IsLookaround()) {
+      // With /u or /v, lookarounds are not quantifiable.
+      if (IsUnicodeMode()) return false;
+      // Lookbehinds are not quantifiable.
+      if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
+        return false;
+      }
+    }
+    if (atom->max_match() == 0) {
+      // Guaranteed to only match an empty string.
+      if (min == 0) {
+        return true;
+      }
+      terms_.emplace_back(atom);
+      return true;
+    }
+  } else {
+    // Only call immediately after adding an atom or character!
+    UNREACHABLE();
+  }
+  terms_.emplace_back(
+      zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));
+  return true;
+}
+
+template class RegExpParserImpl<uint8_t>;
+template class RegExpParserImpl<base::uc16>;
+
+}  // namespace
+
+// static
+bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
+                                             Handle<String> input,
+                                             RegExpFlags flags,
+                                             RegExpCompileData* result) {
+  DisallowGarbageCollection no_gc;
+  uintptr_t stack_limit = isolate->stack_guard()->real_climit();
+  String::FlatContent content = input->GetFlatContent(no_gc);
+  if (content.IsOneByte()) {
+    base::Vector<const uint8_t> v = content.ToOneByteVector();
+    return RegExpParserImpl<uint8_t>{v.begin(),   v.length(), flags,
+                                     stack_limit, zone,       no_gc}
+        .Parse(result);
+  } else {
+    base::Vector<const base::uc16> v = content.ToUC16Vector();
+    return RegExpParserImpl<base::uc16>{v.begin(),   v.length(), flags,
+                                        stack_limit, zone,       no_gc}
+        .Parse(result);
+  }
+}
+
+// static
+template <class CharT>
+bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
+                                      const CharT* input, int input_length,
+                                      RegExpFlags flags,
+                                      RegExpCompileData* result,
+                                      const DisallowGarbageCollection& no_gc) {
+  return RegExpParserImpl<CharT>{input,       input_length, flags,
+                                 stack_limit, zone,         no_gc}
+      .Parse(result);
+}
+
+template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
+    Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
+    const DisallowGarbageCollection&);
+template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
+    Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
+    const DisallowGarbageCollection&);
+
+}  // namespace internal
+}  // namespace v8
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 09:22:09 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 09:22:09 +0000
commit	43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree	620249daf56c0258faa40cbdcf9cfba06de2a846 /js/src/irregexp/imported/regexp-parser.cc
parent	Initial commit. (diff)
download	firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip