summaryrefslogtreecommitdiffstats
path: root/js/src/irregexp/imported/regexp-compiler-tonode.cc
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/irregexp/imported/regexp-compiler-tonode.cc')
-rw-r--r--js/src/irregexp/imported/regexp-compiler-tonode.cc2028
1 files changed, 2028 insertions, 0 deletions
diff --git a/js/src/irregexp/imported/regexp-compiler-tonode.cc b/js/src/irregexp/imported/regexp-compiler-tonode.cc
new file mode 100644
index 0000000000..223274bea5
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-compiler-tonode.cc
@@ -0,0 +1,2028 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-compiler.h"
+
+#include "irregexp/imported/regexp.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "irregexp/imported/special-case.h"
+#include "unicode/locid.h"
+#include "unicode/uniset.h"
+#include "unicode/utypes.h"
+#endif // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
+
+constexpr base::uc32 kMaxCodePoint = 0x10ffff;
+constexpr int kMaxUtf16CodeUnit = 0xffff;
+constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
+constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
+
+// -------------------------------------------------------------------
+// Tree to graph conversion
+
+RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ ZoneList<TextElement>* elms =
+ compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
+ elms->Add(TextElement::Atom(this), compiler->zone());
+ return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
+ on_success);
+}
+
+RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
+ on_success);
+}
+
+namespace {
+
+bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
+ const int* special_class, int length) {
+ length--; // Remove final marker.
+
+ DCHECK_EQ(kRangeEndMarker, special_class[length]);
+ DCHECK_NE(0, ranges->length());
+ DCHECK_NE(0, length);
+ DCHECK_NE(0, special_class[0]);
+
+ if (ranges->length() != (length >> 1) + 1) return false;
+
+ CharacterRange range = ranges->at(0);
+ if (range.from() != 0) return false;
+
+ for (int i = 0; i < length; i += 2) {
+ if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
+ return false;
+ }
+ range = ranges->at((i >> 1) + 1);
+ if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
+ return false;
+ }
+ }
+
+ return range.to() == kMaxCodePoint;
+}
+
+bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
+ int length) {
+ length--; // Remove final marker.
+
+ DCHECK_EQ(kRangeEndMarker, special_class[length]);
+ if (ranges->length() * 2 != length) return false;
+
+ for (int i = 0; i < length; i += 2) {
+ CharacterRange range = ranges->at(i >> 1);
+ if (range.from() != static_cast<base::uc32>(special_class[i]) ||
+ range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace
+
+bool RegExpClassRanges::is_standard(Zone* zone) {
+ // TODO(lrn): Remove need for this function, by not throwing away information
+ // along the way.
+ if (is_negated()) {
+ return false;
+ }
+ if (set_.is_standard()) {
+ return true;
+ }
+ if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
+ return true;
+ }
+ if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
+ return true;
+ }
+ if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
+ kLineTerminatorRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
+ return true;
+ }
+ if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
+ kLineTerminatorRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
+ return true;
+ }
+ if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kWord);
+ return true;
+ }
+ if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kNotWord);
+ return true;
+ }
+ return false;
+}
+
+UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
+ // The unicode range splitter categorizes given character ranges into:
+ // - Code points from the BMP representable by one code unit.
+ // - Code points outside the BMP that need to be split into surrogate pairs.
+ // - Lone lead surrogates.
+ // - Lone trail surrogates.
+ // Lone surrogates are valid code points, even though no actual characters.
+ // They require special matching to make sure we do not split surrogate pairs.
+
+ for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
+}
+
+void UnicodeRangeSplitter::AddRange(CharacterRange range) {
+ static constexpr base::uc32 kBmp1Start = 0;
+ static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
+ static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
+ static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
+
+ // Ends are all inclusive.
+ static_assert(kBmp1Start == 0);
+ static_assert(kBmp1Start < kBmp1End);
+ static_assert(kBmp1End + 1 == kLeadSurrogateStart);
+ static_assert(kLeadSurrogateStart < kLeadSurrogateEnd);
+ static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
+ static_assert(kTrailSurrogateStart < kTrailSurrogateEnd);
+ static_assert(kTrailSurrogateEnd + 1 == kBmp2Start);
+ static_assert(kBmp2Start < kBmp2End);
+ static_assert(kBmp2End + 1 == kNonBmpStart);
+ static_assert(kNonBmpStart < kNonBmpEnd);
+
+ static constexpr base::uc32 kStarts[] = {
+ kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
+ kBmp2Start, kNonBmpStart,
+ };
+
+ static constexpr base::uc32 kEnds[] = {
+ kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
+ };
+
+ CharacterRangeVector* const kTargets[] = {
+ &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
+ };
+
+ static constexpr int kCount = arraysize(kStarts);
+ static_assert(kCount == arraysize(kEnds));
+ static_assert(kCount == arraysize(kTargets));
+
+ for (int i = 0; i < kCount; i++) {
+ if (kStarts[i] > range.to()) break;
+ const base::uc32 from = std::max(kStarts[i], range.from());
+ const base::uc32 to = std::min(kEnds[i], range.to());
+ if (from > to) continue;
+ kTargets[i]->emplace_back(CharacterRange::Range(from, to));
+ }
+}
+
+namespace {
+
+// Translates between new and old V8-isms (SmallVector, ZoneList).
+ZoneList<CharacterRange>* ToCanonicalZoneList(
+ const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
+ if (v->empty()) return nullptr;
+
+ ZoneList<CharacterRange>* result =
+ zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
+ for (size_t i = 0; i < v->size(); i++) {
+ result->Add(v->at(i), zone);
+ }
+
+ CharacterRange::Canonicalize(result);
+ return result;
+}
+
+void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* bmp =
+ ToCanonicalZoneList(splitter->bmp(), compiler->zone());
+ if (bmp == nullptr) return;
+ result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
+ compiler->zone(), bmp, compiler->read_backward(), on_success)));
+}
+
+using UC16Range = uint32_t; // {from, to} packed into one uint32_t.
+constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
+ return (static_cast<uint32_t>(from) << 16) | to;
+}
+constexpr base::uc16 ExtractFrom(UC16Range r) {
+ return static_cast<base::uc16>(r >> 16);
+}
+constexpr base::uc16 ExtractTo(UC16Range r) {
+ return static_cast<base::uc16>(r);
+}
+
+void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ DCHECK(!compiler->one_byte());
+ Zone* const zone = compiler->zone();
+ ZoneList<CharacterRange>* non_bmp =
+ ToCanonicalZoneList(splitter->non_bmp(), zone);
+ if (non_bmp == nullptr) return;
+
+ // Translate each 32-bit code point range into the corresponding 16-bit code
+ // unit representation consisting of the lead- and trail surrogate.
+ //
+ // The generated alternatives are grouped by the leading surrogate to avoid
+ // emitting excessive code. For example, for
+ //
+ // { \ud800[\udc00-\udc01]
+ // , \ud800[\udc05-\udc06]
+ // }
+ //
+ // there's no need to emit matching code for the leading surrogate \ud800
+ // twice. We also create a dedicated grouping for full trailing ranges, i.e.
+ // [dc00-dfff].
+ ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
+ zone);
+ ZoneList<CharacterRange>* leading_with_full_trailing_range =
+ zone->New<ZoneList<CharacterRange>>(1, zone);
+ const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
+ base::uc16 from_t, base::uc16 to_t) {
+ const UC16Range leading_range = ToUC16Range(from_l, to_l);
+ if (grouped_by_leading.count(leading_range) == 0) {
+ if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
+ leading_with_full_trailing_range->Add(
+ CharacterRange::Range(from_l, to_l), zone);
+ return;
+ }
+ grouped_by_leading[leading_range] =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ }
+ grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
+ zone);
+ };
+
+ // First, create the grouped ranges.
+ CharacterRange::Canonicalize(non_bmp);
+ for (int i = 0; i < non_bmp->length(); i++) {
+ // Match surrogate pair.
+ // E.g. [\u10005-\u11005] becomes
+ // \ud800[\udc05-\udfff]|
+ // [\ud801-\ud803][\udc00-\udfff]|
+ // \ud804[\udc00-\udc05]
+ base::uc32 from = non_bmp->at(i).from();
+ base::uc32 to = non_bmp->at(i).to();
+ base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
+ base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
+ base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
+ base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
+
+ if (from_l == to_l) {
+ // The lead surrogate is the same.
+ AddRange(from_l, to_l, from_t, to_t);
+ continue;
+ }
+
+ if (from_t != kTrailSurrogateStart) {
+ // Add [from_l][from_t-\udfff].
+ AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
+ from_l++;
+ }
+ if (to_t != kTrailSurrogateEnd) {
+ // Add [to_l][\udc00-to_t].
+ AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
+ to_l--;
+ }
+ if (from_l <= to_l) {
+ // Add [from_l-to_l][\udc00-\udfff].
+ AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
+ }
+ }
+
+ // Create the actual TextNode now that ranges are fully grouped.
+ if (!leading_with_full_trailing_range->is_empty()) {
+ CharacterRange::Canonicalize(leading_with_full_trailing_range);
+ result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, leading_with_full_trailing_range,
+ CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
+ compiler->read_backward(), on_success)));
+ }
+ for (const auto& it : grouped_by_leading) {
+ CharacterRange leading_range =
+ CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
+ ZoneList<CharacterRange>* trailing_ranges = it.second;
+ CharacterRange::Canonicalize(trailing_ranges);
+ result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, leading_range, trailing_ranges, compiler->read_backward(),
+ on_success)));
+ }
+}
+
+RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
+ RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
+ ZoneList<CharacterRange>* match, RegExpNode* on_success,
+ bool read_backward) {
+ Zone* zone = compiler->zone();
+ RegExpNode* match_node = TextNode::CreateForCharacterRanges(
+ zone, match, read_backward, on_success);
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ RegExpLookaround::Builder lookaround(false, match_node, stack_register,
+ position_register);
+ RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
+ zone, lookbehind, !read_backward, lookaround.on_match_success());
+ return lookaround.ForMatch(negative_match);
+}
+
+RegExpNode* MatchAndNegativeLookaroundInReadDirection(
+ RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
+ ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
+ bool read_backward) {
+ Zone* zone = compiler->zone();
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ RegExpLookaround::Builder lookaround(false, on_success, stack_register,
+ position_register);
+ RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
+ zone, lookahead, read_backward, lookaround.on_match_success());
+ return TextNode::CreateForCharacterRanges(
+ zone, match, read_backward, lookaround.ForMatch(negative_match));
+}
+
+void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* lead_surrogates =
+ ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
+ if (lead_surrogates == nullptr) return;
+ Zone* zone = compiler->zone();
+ // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
+ ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
+ zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+ RegExpNode* match;
+ if (compiler->read_backward()) {
+ // Reading backward. Assert that reading forward, there is no trail
+ // surrogate, and then backward match the lead surrogate.
+ match = NegativeLookaroundAgainstReadDirectionAndMatch(
+ compiler, trail_surrogates, lead_surrogates, on_success, true);
+ } else {
+ // Reading forward. Forward match the lead surrogate and assert that
+ // no trail surrogate follows.
+ match = MatchAndNegativeLookaroundInReadDirection(
+ compiler, lead_surrogates, trail_surrogates, on_success, false);
+ }
+ result->AddAlternative(GuardedAlternative(match));
+}
+
+void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ ZoneList<CharacterRange>* trail_surrogates =
+ ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
+ if (trail_surrogates == nullptr) return;
+ Zone* zone = compiler->zone();
+ // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
+ ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
+ zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+
+ RegExpNode* match;
+ if (compiler->read_backward()) {
+ // Reading backward. Backward match the trail surrogate and assert that no
+ // lead surrogate precedes it.
+ match = MatchAndNegativeLookaroundInReadDirection(
+ compiler, trail_surrogates, lead_surrogates, on_success, true);
+ } else {
+ // Reading forward. Assert that reading backward, there is no lead
+ // surrogate, and then forward match the trail surrogate.
+ match = NegativeLookaroundAgainstReadDirectionAndMatch(
+ compiler, lead_surrogates, trail_surrogates, on_success, false);
+ }
+ result->AddAlternative(GuardedAlternative(match));
+}
+
+RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
+ DCHECK(!compiler->read_backward());
+ Zone* zone = compiler->zone();
+ // Advance any character. If the character happens to be a lead surrogate and
+ // we advanced into the middle of a surrogate pair, it will work out, as
+ // nothing will match from there. We will have to advance again, consuming
+ // the associated trail surrogate.
+ ZoneList<CharacterRange>* range =
+ CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit));
+ return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
+}
+
+} // namespace
+
+#ifdef V8_INTL_SUPPORT
+// static
+void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
+ // Remove characters for which closeOver() adds full-case-folding equivalents
+ // because we should work only with simple case folding mappings.
+ icu::UnicodeSet non_simple = icu::UnicodeSet(set);
+ non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
+ set.removeAll(non_simple);
+
+ set.closeOver(USET_CASE_INSENSITIVE);
+ // Full case folding maps single characters to multiple characters.
+ // Those are represented as strings in the set. Remove them so that
+ // we end up with only simple and common case mappings.
+ set.removeAllStrings();
+
+ // Add characters that have non-simple case foldings again (they match
+ // themselves).
+ set.addAll(non_simple);
+}
+#endif // V8_INTL_SUPPORT
+
+// static
+void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
+ Zone* zone) {
+#ifdef V8_INTL_SUPPORT
+ DCHECK(IsCanonical(ranges));
+
+ // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
+ // See also https://crbug.com/v8/6727.
+ // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
+ // which we use frequently internally. But large ranges can also easily be
+ // created by the user. We might want to have a more general caching mechanism
+ // for such ranges.
+ if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
+
+ // Use ICU to compute the case fold closure over the ranges.
+ icu::UnicodeSet set;
+ for (int i = 0; i < ranges->length(); i++) {
+ set.add(ranges->at(i).from(), ranges->at(i).to());
+ }
+ // Clear the ranges list without freeing the backing store.
+ ranges->Rewind(0);
+
+ UnicodeSimpleCloseOver(set);
+ for (int i = 0; i < set.getRangeCount(); i++) {
+ ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
+ }
+ // No errors and everything we collected have been ranges.
+ Canonicalize(ranges);
+#endif // V8_INTL_SUPPORT
+}
+
+RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ set_.Canonicalize();
+ Zone* const zone = compiler->zone();
+ ZoneList<CharacterRange>* ranges = this->ranges(zone);
+
+ if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
+ CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
+ }
+
+ if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() ||
+ contains_split_surrogate()) {
+ return zone->New<TextNode>(this, compiler->read_backward(), on_success);
+ }
+
+ if (is_negated()) {
+ // With /v, character classes are never negated.
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom
+ // Atom :: CharacterClass
+ // 4. Assert: cc.[[Invert]] is false.
+ // Instead the complement is created when evaluating the class set.
+ // The only exception is the "nothing range" (negated everything), which is
+ // internally created for an empty set.
+ DCHECK_IMPLIES(
+ IsUnicodeSets(compiler->flags()),
+ ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
+ ZoneList<CharacterRange>* negated =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ CharacterRange::Negate(ranges, negated, zone);
+ ranges = negated;
+ }
+
+ if (ranges->length() == 0) {
+ // The empty character class is used as a 'fail' node.
+ RegExpClassRanges* fail = zone->New<RegExpClassRanges>(zone, ranges);
+ return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
+ }
+
+ if (set_.is_standard() &&
+ standard_type() == StandardCharacterSet::kEverything) {
+ return UnanchoredAdvance(compiler, on_success);
+ }
+
+ // Split ranges in order to handle surrogates correctly:
+ // - Surrogate pairs: translate the 32-bit code point into two uc16 code
+ // units (irregexp operates only on code units).
+ // - Lone surrogates: these require lookarounds to ensure we don't match in
+ // the middle of a surrogate pair.
+ ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
+ UnicodeRangeSplitter splitter(ranges);
+ AddBmpCharacters(compiler, result, on_success, &splitter);
+ AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
+ AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
+ AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
+
+ static constexpr int kMaxRangesToInline = 32; // Arbitrary.
+ if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
+
+ return result;
+}
+
+RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ Zone* zone = compiler->zone();
+ const int size = (has_strings() ? static_cast<int>(strings()->size()) : 0) +
+ (ranges()->is_empty() ? 0 : 1);
+ if (size == 0) {
+ // If neither ranges nor strings are present, the operand is equal to an
+ // empty range (matching nothing).
+ ZoneList<CharacterRange>* empty =
+ zone->template New<ZoneList<CharacterRange>>(0, zone);
+ return zone->template New<RegExpClassRanges>(zone, empty)
+ ->ToNode(compiler, on_success);
+ }
+ ZoneList<RegExpTree*>* alternatives =
+ zone->template New<ZoneList<RegExpTree*>>(size, zone);
+ // Strings are sorted by length first (larger strings before shorter ones).
+ // See the comment on CharacterClassStrings.
+ // Empty strings (if present) are added after character ranges.
+ RegExpTree* empty_string = nullptr;
+ if (has_strings()) {
+ for (auto string : *strings()) {
+ if (string.second->IsEmpty()) {
+ empty_string = string.second;
+ } else {
+ alternatives->Add(string.second, zone);
+ }
+ }
+ }
+ if (!ranges()->is_empty()) {
+ alternatives->Add(zone->template New<RegExpClassRanges>(zone, ranges()),
+ zone);
+ }
+ if (empty_string != nullptr) {
+ alternatives->Add(empty_string, zone);
+ }
+
+ RegExpTree* node = nullptr;
+ if (size == 1) {
+ DCHECK_EQ(alternatives->length(), 1);
+ node = alternatives->first();
+ } else {
+ node = zone->template New<RegExpDisjunction>(alternatives);
+ }
+ return node->ToNode(compiler, on_success);
+}
+
+RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ Zone* zone = compiler->zone();
+ ZoneList<CharacterRange>* temp_ranges =
+ zone->template New<ZoneList<CharacterRange>>(4, zone);
+ RegExpClassSetOperand* root = ComputeExpression(this, temp_ranges, zone);
+ return root->ToNode(compiler, on_success);
+}
+
+void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
+ ranges()->AddAll(*other->ranges(), zone);
+ strings()->insert(other->strings()->begin(), other->strings()->end());
+}
+
+void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
+ ZoneList<CharacterRange>* temp_ranges,
+ Zone* zone) {
+ CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
+ std::swap(*ranges(), *temp_ranges);
+ temp_ranges->Rewind(0);
+ for (auto iter = strings()->begin(); iter != strings()->end();) {
+ if (other->strings()->find(iter->first) == other->strings()->end()) {
+ iter = strings()->erase(iter);
+ } else {
+ iter++;
+ }
+ }
+}
+
+void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
+ ZoneList<CharacterRange>* temp_ranges,
+ Zone* zone) {
+ CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
+ std::swap(*ranges(), *temp_ranges);
+ temp_ranges->Rewind(0);
+ for (auto iter = strings()->begin(); iter != strings()->end();) {
+ if (other->strings()->find(iter->first) != other->strings()->end()) {
+ iter = strings()->erase(iter);
+ } else {
+ iter++;
+ }
+ }
+}
+
+// static
+RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(
+ RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
+ DCHECK(temp_ranges->is_empty());
+ if (root->IsClassSetOperand()) {
+ return root->AsClassSetOperand();
+ }
+ DCHECK(root->IsClassSetExpression());
+ RegExpClassSetExpression* node = root->AsClassSetExpression();
+ RegExpClassSetOperand* result =
+ ComputeExpression(node->operands()->at(0), temp_ranges, zone);
+ switch (node->operation()) {
+ case OperationType::kUnion: {
+ for (int i = 1; i < node->operands()->length(); i++) {
+ RegExpClassSetOperand* op =
+ ComputeExpression(node->operands()->at(i), temp_ranges, zone);
+ result->Union(op, zone);
+ }
+ CharacterRange::Canonicalize(result->ranges());
+ break;
+ }
+ case OperationType::kIntersection: {
+ for (int i = 1; i < node->operands()->length(); i++) {
+ RegExpClassSetOperand* op =
+ ComputeExpression(node->operands()->at(i), temp_ranges, zone);
+ result->Intersect(op, temp_ranges, zone);
+ }
+ break;
+ }
+ case OperationType::kSubtraction: {
+ for (int i = 1; i < node->operands()->length(); i++) {
+ RegExpClassSetOperand* op =
+ ComputeExpression(node->operands()->at(i), temp_ranges, zone);
+ result->Subtract(op, temp_ranges, zone);
+ }
+ break;
+ }
+ }
+ if (node->is_negated()) {
+ DCHECK(!result->has_strings());
+ CharacterRange::Negate(result->ranges(), temp_ranges, zone);
+ std::swap(*result->ranges(), *temp_ranges);
+ temp_ranges->Rewind(0);
+ }
+ // Store the result as single operand of the current node.
+ node->operands()->Set(0, result);
+ node->operands()->Rewind(1);
+
+ return result;
+}
+
+namespace {
+
+int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
+ RegExpAtom* atom1 = (*a)->AsAtom();
+ RegExpAtom* atom2 = (*b)->AsAtom();
+ base::uc16 character1 = atom1->data().at(0);
+ base::uc16 character2 = atom2->data().at(0);
+ if (character1 < character2) return -1;
+ if (character1 > character2) return 1;
+ return 0;
+}
+
+#ifdef V8_INTL_SUPPORT
+
+int CompareCaseInsensitive(const icu::UnicodeString& a,
+ const icu::UnicodeString& b) {
+ return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
+}
+
+int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
+ RegExpTree* const* b) {
+ RegExpAtom* atom1 = (*a)->AsAtom();
+ RegExpAtom* atom2 = (*b)->AsAtom();
+ return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
+ icu::UnicodeString{atom2->data().at(0)});
+}
+
+bool Equals(bool ignore_case, const icu::UnicodeString& a,
+ const icu::UnicodeString& b) {
+ if (a == b) return true;
+ if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
+ return false; // Case-sensitive equality already checked above.
+}
+
+bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
+ const RegExpAtom* b) {
+ return Equals(ignore_case, a->data().at(index), b->data().at(index));
+}
+
+#else
+
+unibrow::uchar Canonical(
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ unibrow::uchar c) {
+ unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
+ int length = canonicalize->get(c, '\0', chars);
+ DCHECK_LE(length, 1);
+ unibrow::uchar canonical = c;
+ if (length == 1) canonical = chars[0];
+ return canonical;
+}
+
+int CompareCaseInsensitive(
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ unibrow::uchar a, unibrow::uchar b) {
+ if (a == b) return 0;
+ if (a >= 'a' || b >= 'a') {
+ a = Canonical(canonicalize, a);
+ b = Canonical(canonicalize, b);
+ }
+ return static_cast<int>(a) - static_cast<int>(b);
+}
+
+int CompareFirstCharCaseInsensitive(
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ RegExpTree* const* a, RegExpTree* const* b) {
+ RegExpAtom* atom1 = (*a)->AsAtom();
+ RegExpAtom* atom2 = (*b)->AsAtom();
+ return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
+ atom2->data().at(0));
+}
+
+bool Equals(bool ignore_case,
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ unibrow::uchar a, unibrow::uchar b) {
+ if (a == b) return true;
+ if (ignore_case) {
+ return CompareCaseInsensitive(canonicalize, a, b) == 0;
+ }
+ return false; // Case-sensitive equality already checked above.
+}
+
+bool CharAtEquals(bool ignore_case,
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ int index, const RegExpAtom* a, const RegExpAtom* b) {
+ return Equals(ignore_case, canonicalize, a->data().at(index),
+ b->data().at(index));
+}
+
+#endif // V8_INTL_SUPPORT
+
+} // namespace
+
+// We can stable sort runs of atoms, since the order does not matter if they
+// start with different characters.
+// Returns true if any consecutive atoms were found.
+bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+ bool found_consecutive_atoms = false;
+ for (int i = 0; i < length; i++) {
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (alternative->IsAtom()) break;
+ i++;
+ }
+ // i is length or it is the index of an atom.
+ if (i == length) break;
+ int first_atom = i;
+ i++;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ i++;
+ }
+ // Sort atoms to get ones with common prefixes together.
+ // This step is more tricky if we are in a case-independent regexp,
+ // because it would change /is|I/ to /I|is/, and order matters when
+ // the regexp parts don't match only disjoint starting points. To fix
+ // this we have a version of CompareFirstChar that uses case-
+ // independent character classes for comparison.
+ DCHECK_LT(first_atom, alternatives->length());
+ DCHECK_LE(i, alternatives->length());
+ DCHECK_LE(first_atom, i);
+ if (IsIgnoreCase(compiler->flags())) {
+#ifdef V8_INTL_SUPPORT
+ alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
+ i - first_atom);
+#else
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+ compiler->isolate()->regexp_macro_assembler_canonicalize();
+ auto compare_closure = [canonicalize](RegExpTree* const* a,
+ RegExpTree* const* b) {
+ return CompareFirstCharCaseInsensitive(canonicalize, a, b);
+ };
+ alternatives->StableSort(compare_closure, first_atom, i - first_atom);
+#endif // V8_INTL_SUPPORT
+ } else {
+ alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
+ }
+ if (i - first_atom > 1) found_consecutive_atoms = true;
+ }
+ return found_consecutive_atoms;
+}
+
+// Optimizes ab|ac|az to a(?:b|c|d).
+void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
+ Zone* zone = compiler->zone();
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+ const bool ignore_case = IsIgnoreCase(compiler->flags());
+
+ int write_posn = 0;
+ int i = 0;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ RegExpAtom* const atom = alternative->AsAtom();
+#ifdef V8_INTL_SUPPORT
+ icu::UnicodeString common_prefix(atom->data().at(0));
+#else
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
+ compiler->isolate()->regexp_macro_assembler_canonicalize();
+ unibrow::uchar common_prefix = atom->data().at(0);
+ if (ignore_case) {
+ common_prefix = Canonical(canonicalize, common_prefix);
+ }
+#endif // V8_INTL_SUPPORT
+ int first_with_prefix = i;
+ int prefix_length = atom->length();
+ i++;
+ while (i < length) {
+ alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ RegExpAtom* const alt_atom = alternative->AsAtom();
+#ifdef V8_INTL_SUPPORT
+ icu::UnicodeString new_prefix(alt_atom->data().at(0));
+ if (!Equals(ignore_case, new_prefix, common_prefix)) break;
+#else
+ unibrow::uchar new_prefix = alt_atom->data().at(0);
+ if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
+#endif // V8_INTL_SUPPORT
+ prefix_length = std::min(prefix_length, alt_atom->length());
+ i++;
+ }
+ if (i > first_with_prefix + 2) {
+ // Found worthwhile run of alternatives with common prefix of at least one
+ // character. The sorting function above did not sort on more than one
+ // character for reasons of correctness, but there may still be a longer
+ // common prefix if the terms were similar or presorted in the input.
+ // Find out how long the common prefix is.
+ int run_length = i - first_with_prefix;
+ RegExpAtom* const alt_atom =
+ alternatives->at(first_with_prefix)->AsAtom();
+ for (int j = 1; j < run_length && prefix_length > 1; j++) {
+ RegExpAtom* old_atom =
+ alternatives->at(j + first_with_prefix)->AsAtom();
+ for (int k = 1; k < prefix_length; k++) {
+#ifdef V8_INTL_SUPPORT
+ if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
+#else
+ if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
+#endif // V8_INTL_SUPPORT
+ prefix_length = k;
+ break;
+ }
+ }
+ }
+ RegExpAtom* prefix =
+ zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length));
+ ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
+ pair->Add(prefix, zone);
+ ZoneList<RegExpTree*>* suffixes =
+ zone->New<ZoneList<RegExpTree*>>(run_length, zone);
+ for (int j = 0; j < run_length; j++) {
+ RegExpAtom* old_atom =
+ alternatives->at(j + first_with_prefix)->AsAtom();
+ int len = old_atom->length();
+ if (len == prefix_length) {
+ suffixes->Add(zone->New<RegExpEmpty>(), zone);
+ } else {
+ RegExpTree* suffix = zone->New<RegExpAtom>(
+ old_atom->data().SubVector(prefix_length, old_atom->length()));
+ suffixes->Add(suffix, zone);
+ }
+ }
+ pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
+ alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
+ } else {
+ // Just copy any non-worthwhile alternatives.
+ for (int j = first_with_prefix; j < i; j++) {
+ alternatives->at(write_posn++) = alternatives->at(j);
+ }
+ }
+ }
+ alternatives->Rewind(write_posn); // Trim end of array.
+}
+
+// Optimizes b|c|z to [bcz].
+void RegExpDisjunction::FixSingleCharacterDisjunctions(
+ RegExpCompiler* compiler) {
+ Zone* zone = compiler->zone();
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+
+ int write_posn = 0;
+ int i = 0;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ RegExpAtom* const atom = alternative->AsAtom();
+ if (atom->length() != 1) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ const RegExpFlags flags = compiler->flags();
+ DCHECK_IMPLIES(IsEitherUnicode(flags),
+ !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
+ bool contains_trail_surrogate =
+ unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
+ int first_in_run = i;
+ i++;
+ // Find a run of single-character atom alternatives that have identical
+ // flags (case independence and unicode-ness).
+ while (i < length) {
+ alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ RegExpAtom* const alt_atom = alternative->AsAtom();
+ if (alt_atom->length() != 1) break;
+ DCHECK_IMPLIES(IsEitherUnicode(flags),
+ !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0)));
+ contains_trail_surrogate |=
+ unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0));
+ i++;
+ }
+ if (i > first_in_run + 1) {
+ // Found non-trivial run of single-character alternatives.
+ int run_length = i - first_in_run;
+ ZoneList<CharacterRange>* ranges =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ for (int j = 0; j < run_length; j++) {
+ RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
+ DCHECK_EQ(old_atom->length(), 1);
+ ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
+ }
+ RegExpClassRanges::ClassRangesFlags class_ranges_flags;
+ if (IsEitherUnicode(flags) && contains_trail_surrogate) {
+ class_ranges_flags = RegExpClassRanges::CONTAINS_SPLIT_SURROGATE;
+ }
+ alternatives->at(write_posn++) =
+ zone->New<RegExpClassRanges>(zone, ranges, class_ranges_flags);
+ } else {
+ // Just copy any trivial alternatives.
+ for (int j = first_in_run; j < i; j++) {
+ alternatives->at(write_posn++) = alternatives->at(j);
+ }
+ }
+ }
+ alternatives->Rewind(write_posn); // Trim end of array.
+}
+
+RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ compiler->ToNodeMaybeCheckForStackOverflow();
+
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+
+ if (alternatives->length() > 2) {
+ bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
+ if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
+ FixSingleCharacterDisjunctions(compiler);
+ if (alternatives->length() == 1) {
+ return alternatives->at(0)->ToNode(compiler, on_success);
+ }
+ }
+
+ int length = alternatives->length();
+
+ ChoiceNode* result =
+ compiler->zone()->New<ChoiceNode>(length, compiler->zone());
+ for (int i = 0; i < length; i++) {
+ GuardedAlternative alternative(
+ alternatives->at(i)->ToNode(compiler, on_success));
+ result->AddAlternative(alternative);
+ }
+ return result;
+}
+
+RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
+}
+
+namespace {
+// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
+// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
+RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpAssertion::Type type,
+ RegExpFlags flags) {
+ CHECK(NeedsUnicodeCaseEquivalents(flags));
+ Zone* zone = compiler->zone();
+ ZoneList<CharacterRange>* word_range =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true,
+ zone);
+ int stack_register = compiler->UnicodeLookaroundStackRegister();
+ int position_register = compiler->UnicodeLookaroundPositionRegister();
+ ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
+ // Add two choices. The (non-)boundary could start with a word or
+ // a non-word-character.
+ for (int i = 0; i < 2; i++) {
+ bool lookbehind_for_word = i == 0;
+ bool lookahead_for_word =
+ (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word;
+ // Look to the left.
+ RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
+ stack_register, position_register);
+ RegExpNode* backward = TextNode::CreateForCharacterRanges(
+ zone, word_range, true, lookbehind.on_match_success());
+ // Look to the right.
+ RegExpLookaround::Builder lookahead(lookahead_for_word,
+ lookbehind.ForMatch(backward),
+ stack_register, position_register);
+ RegExpNode* forward = TextNode::CreateForCharacterRanges(
+ zone, word_range, false, lookahead.on_match_success());
+ result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
+ }
+ return result;
+}
+} // anonymous namespace
+
+RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ NodeInfo info;
+ Zone* zone = compiler->zone();
+
+ switch (assertion_type()) {
+ case Type::START_OF_LINE:
+ return AssertionNode::AfterNewline(on_success);
+ case Type::START_OF_INPUT:
+ return AssertionNode::AtStart(on_success);
+ case Type::BOUNDARY:
+ return NeedsUnicodeCaseEquivalents(compiler->flags())
+ ? BoundaryAssertionAsLookaround(
+ compiler, on_success, Type::BOUNDARY, compiler->flags())
+ : AssertionNode::AtBoundary(on_success);
+ case Type::NON_BOUNDARY:
+ return NeedsUnicodeCaseEquivalents(compiler->flags())
+ ? BoundaryAssertionAsLookaround(compiler, on_success,
+ Type::NON_BOUNDARY,
+ compiler->flags())
+ : AssertionNode::AtNonBoundary(on_success);
+ case Type::END_OF_INPUT:
+ return AssertionNode::AtEnd(on_success);
+ case Type::END_OF_LINE: {
+ // Compile $ in multiline regexps as an alternation with a positive
+ // lookahead in one side and an end-of-input on the other side.
+ // We need two registers for the lookahead.
+ int stack_pointer_register = compiler->AllocateRegister();
+ int position_register = compiler->AllocateRegister();
+ // The ChoiceNode to distinguish between a newline and end-of-input.
+ ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
+ // Create a newline atom.
+ ZoneList<CharacterRange>* newline_ranges =
+ zone->New<ZoneList<CharacterRange>>(3, zone);
+ CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator,
+ newline_ranges, false, zone);
+ RegExpClassRanges* newline_atom =
+ zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator);
+ TextNode* newline_matcher =
+ zone->New<TextNode>(newline_atom, false,
+ ActionNode::PositiveSubmatchSuccess(
+ stack_pointer_register, position_register,
+ 0, // No captures inside.
+ -1, // Ignored if no captures.
+ on_success));
+ // Create an end-of-input matcher.
+ RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
+ stack_pointer_register, position_register, newline_matcher);
+ // Add the two alternatives to the ChoiceNode.
+ GuardedAlternative eol_alternative(end_of_line);
+ result->AddAlternative(eol_alternative);
+ GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
+ result->AddAlternative(end_alternative);
+ return result;
+ }
+ default:
+ UNREACHABLE();
+ }
+}
+
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return compiler->zone()->New<BackReferenceNode>(
+ RegExpCapture::StartRegister(index()),
+ RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
+ on_success);
+}
+
+RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return on_success;
+}
+
+RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return body_->ToNode(compiler, on_success);
+}
+
+RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
+ int stack_pointer_register,
+ int position_register,
+ int capture_register_count,
+ int capture_register_start)
+ : is_positive_(is_positive),
+ on_success_(on_success),
+ stack_pointer_register_(stack_pointer_register),
+ position_register_(position_register) {
+ if (is_positive_) {
+ on_match_success_ = ActionNode::PositiveSubmatchSuccess(
+ stack_pointer_register, position_register, capture_register_count,
+ capture_register_start, on_success_);
+ } else {
+ Zone* zone = on_success_->zone();
+ on_match_success_ = zone->New<NegativeSubmatchSuccess>(
+ stack_pointer_register, position_register, capture_register_count,
+ capture_register_start, zone);
+ }
+}
+
+RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
+ if (is_positive_) {
+ return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
+ position_register_, match);
+ } else {
+ Zone* zone = on_success_->zone();
+ // We use a ChoiceNode to represent the negative lookaround. The first
+ // alternative is the negative match. On success, the end node backtracks.
+ // On failure, the second alternative is tried and leads to success.
+ // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
+ // first exit when calculating quick checks.
+ ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
+ GuardedAlternative(match), GuardedAlternative(on_success_), zone);
+ return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
+ position_register_, choice_node);
+ }
+}
+
+RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ int stack_pointer_register = compiler->AllocateRegister();
+ int position_register = compiler->AllocateRegister();
+
+ const int registers_per_capture = 2;
+ const int register_of_first_capture = 2;
+ int register_count = capture_count_ * registers_per_capture;
+ int register_start =
+ register_of_first_capture + capture_from_ * registers_per_capture;
+
+ RegExpNode* result;
+ bool was_reading_backward = compiler->read_backward();
+ compiler->set_read_backward(type() == LOOKBEHIND);
+ Builder builder(is_positive(), on_success, stack_pointer_register,
+ position_register, register_count, register_start);
+ RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
+ result = builder.ForMatch(match);
+ compiler->set_read_backward(was_reading_backward);
+ return result;
+}
+
+RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return ToNode(body(), index(), compiler, on_success);
+}
+
+RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
+ RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ DCHECK_NOT_NULL(body);
+ int start_reg = RegExpCapture::StartRegister(index);
+ int end_reg = RegExpCapture::EndRegister(index);
+ if (compiler->read_backward()) std::swap(start_reg, end_reg);
+ RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
+ RegExpNode* body_node = body->ToNode(compiler, store_end);
+ return ActionNode::StorePosition(start_reg, true, body_node);
+}
+
+namespace {
+
+class AssertionSequenceRewriter final {
+ public:
+ // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
+ // instead of sprinkling rewrites into the AST->Node conversion process.
+ static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
+ AssertionSequenceRewriter rewriter(terms, zone);
+
+ static constexpr int kNoIndex = -1;
+ int from = kNoIndex;
+
+ for (int i = 0; i < terms->length(); i++) {
+ RegExpTree* t = terms->at(i);
+ if (from == kNoIndex && t->IsAssertion()) {
+ from = i; // Start a sequence.
+ } else if (from != kNoIndex && !t->IsAssertion()) {
+ // Terminate and process the sequence.
+ if (i - from > 1) rewriter.Rewrite(from, i);
+ from = kNoIndex;
+ }
+ }
+
+ if (from != kNoIndex && terms->length() - from > 1) {
+ rewriter.Rewrite(from, terms->length());
+ }
+ }
+
+ // All assertions are zero width. A consecutive sequence of assertions is
+ // order-independent. There's two ways we can optimize here:
+ // 1. fold all identical assertions.
+ // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
+ // sequence fails.
+ void Rewrite(int from, int to) {
+ DCHECK_GT(to, from + 1);
+
+ // Bitfield of all seen assertions.
+ uint32_t seen_assertions = 0;
+ static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
+ kUInt32Size * kBitsPerByte);
+
+ for (int i = from; i < to; i++) {
+ RegExpAssertion* t = terms_->at(i)->AsAssertion();
+ const uint32_t bit = 1 << static_cast<int>(t->assertion_type());
+
+ if (seen_assertions & bit) {
+ // Fold duplicates.
+ terms_->Set(i, zone_->New<RegExpEmpty>());
+ }
+
+ seen_assertions |= bit;
+ }
+
+ // Collapse failures.
+ const uint32_t always_fails_mask =
+ 1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
+ 1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
+ if ((seen_assertions & always_fails_mask) == always_fails_mask) {
+ ReplaceSequenceWithFailure(from, to);
+ }
+ }
+
+ void ReplaceSequenceWithFailure(int from, int to) {
+ // Replace the entire sequence with a single node that always fails.
+ // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
+ // negated '*' (everything) range serves the purpose.
+ ZoneList<CharacterRange>* ranges =
+ zone_->New<ZoneList<CharacterRange>>(0, zone_);
+ RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges);
+ terms_->Set(from, cc);
+
+ // Zero out the rest.
+ RegExpEmpty* empty = zone_->New<RegExpEmpty>();
+ for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
+ }
+
+ private:
+ AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
+ : zone_(zone), terms_(terms) {}
+
+ Zone* zone_;
+ ZoneList<RegExpTree*>* terms_;
+};
+
+} // namespace
+
+RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ compiler->ToNodeMaybeCheckForStackOverflow();
+
+ ZoneList<RegExpTree*>* children = nodes();
+
+ AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
+
+ RegExpNode* current = on_success;
+ if (compiler->read_backward()) {
+ for (int i = 0; i < children->length(); i++) {
+ current = children->at(i)->ToNode(compiler, current);
+ }
+ } else {
+ for (int i = children->length() - 1; i >= 0; i--) {
+ current = children->at(i)->ToNode(compiler, current);
+ }
+ }
+ return current;
+}
+
+namespace {
+
+void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
+ Zone* zone) {
+ elmc--;
+ DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
+ for (int i = 0; i < elmc; i += 2) {
+ DCHECK(elmv[i] < elmv[i + 1]);
+ ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
+ }
+}
+
+void AddClassNegated(const int* elmv, int elmc,
+ ZoneList<CharacterRange>* ranges, Zone* zone) {
+ elmc--;
+ DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
+ DCHECK_NE(0x0000, elmv[0]);
+ DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);
+ base::uc16 last = 0x0000;
+ for (int i = 0; i < elmc; i += 2) {
+ DCHECK(last <= elmv[i] - 1);
+ DCHECK(elmv[i] < elmv[i + 1]);
+ ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
+ last = elmv[i + 1];
+ }
+ ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
+}
+
+} // namespace
+
+void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
+ ZoneList<CharacterRange>* ranges,
+ bool add_unicode_case_equivalents,
+ Zone* zone) {
+ if (add_unicode_case_equivalents &&
+ (standard_character_set == StandardCharacterSet::kWord ||
+ standard_character_set == StandardCharacterSet::kNotWord)) {
+ // See #sec-runtime-semantics-wordcharacters-abstract-operation
+ // In case of unicode and ignore_case, we need to create the closure over
+ // case equivalent characters before negating.
+ ZoneList<CharacterRange>* new_ranges =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
+ AddUnicodeCaseEquivalents(new_ranges, zone);
+ if (standard_character_set == StandardCharacterSet::kNotWord) {
+ ZoneList<CharacterRange>* negated =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ CharacterRange::Negate(new_ranges, negated, zone);
+ new_ranges = negated;
+ }
+ ranges->AddAll(*new_ranges, zone);
+ return;
+ }
+
+ switch (standard_character_set) {
+ case StandardCharacterSet::kWhitespace:
+ AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
+ break;
+ case StandardCharacterSet::kNotWhitespace:
+ AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
+ break;
+ case StandardCharacterSet::kWord:
+ AddClass(kWordRanges, kWordRangeCount, ranges, zone);
+ break;
+ case StandardCharacterSet::kNotWord:
+ AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
+ break;
+ case StandardCharacterSet::kDigit:
+ AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
+ break;
+ case StandardCharacterSet::kNotDigit:
+ AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
+ break;
+ // This is the set of characters matched by the $ and ^ symbols
+ // in multiline mode.
+ case StandardCharacterSet::kLineTerminator:
+ AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
+ break;
+ case StandardCharacterSet::kNotLineTerminator:
+ AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
+ zone);
+ break;
+ // This is not a character range as defined by the spec but a
+ // convenient shorthand for a character class that matches any
+ // character.
+ case StandardCharacterSet::kEverything:
+ ranges->Add(CharacterRange::Everything(), zone);
+ break;
+ }
+}
+
+// static
+void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
+ ZoneList<CharacterRange>* ranges,
+ bool is_one_byte) {
+ CharacterRange::Canonicalize(ranges);
+ int range_count = ranges->length();
+#ifdef V8_INTL_SUPPORT
+ icu::UnicodeSet others;
+ for (int i = 0; i < range_count; i++) {
+ CharacterRange range = ranges->at(i);
+ base::uc32 from = range.from();
+ if (from > kMaxUtf16CodeUnit) continue;
+ base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
+ // Nothing to be done for surrogates.
+ if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
+ if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
+ if (from > kMaxOneByteCharCode) continue;
+ if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode;
+ }
+ others.add(from, to);
+ }
+
+ // Compute the set of additional characters that should be added,
+ // using UnicodeSet::closeOver. ECMA 262 defines slightly different
+ // case-folding rules than Unicode, so some characters that are
+ // added by closeOver do not match anything other than themselves in
+ // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
+ // same case-insensitive character as 's' or 'S' according to
+ // Unicode, but does not match any other character in JS. To handle
+ // this case, we add such characters to the IgnoreSet and filter
+ // them out. We filter twice: once before calling closeOver (to
+ // prevent 'ſ' from adding 's'), and once after calling closeOver
+ // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
+ // more information.
+ icu::UnicodeSet already_added(others);
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
+ others.closeOver(USET_CASE_INSENSITIVE);
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
+ others.removeAll(already_added);
+
+ // Add others to the ranges
+ for (int32_t i = 0; i < others.getRangeCount(); i++) {
+ UChar32 from = others.getRangeStart(i);
+ UChar32 to = others.getRangeEnd(i);
+ if (from == to) {
+ ranges->Add(CharacterRange::Singleton(from), zone);
+ } else {
+ ranges->Add(CharacterRange::Range(from, to), zone);
+ }
+ }
+#else
+ for (int i = 0; i < range_count; i++) {
+ CharacterRange range = ranges->at(i);
+ base::uc32 bottom = range.from();
+ if (bottom > kMaxUtf16CodeUnit) continue;
+ base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
+ // Nothing to be done for surrogates.
+ if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
+ if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
+ if (bottom > kMaxOneByteCharCode) continue;
+ if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode;
+ }
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ if (top == bottom) {
+ // If this is a singleton we just expand the one character.
+ int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
+ for (int i = 0; i < length; i++) {
+ base::uc32 chr = chars[i];
+ if (chr != bottom) {
+ ranges->Add(CharacterRange::Singleton(chars[i]), zone);
+ }
+ }
+ } else {
+ // If this is a range we expand the characters block by block, expanding
+ // contiguous subranges (blocks) one at a time. The approach is as
+ // follows. For a given start character we look up the remainder of the
+ // block that contains it (represented by the end point), for instance we
+ // find 'z' if the character is 'c'. A block is characterized by the
+ // property that all characters uncanonicalize in the same way, except
+ // that each entry in the result is incremented by the distance from the
+ // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
+ // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
+ // we've found the end point we look up its uncanonicalization and
+ // produce a range for each element. For instance for [c-f] we look up
+ // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
+ // it is not already contained in the input, so [c-f] will be skipped but
+ // [C-F] will be added. If this range is not completely contained in a
+ // block we do this for all the blocks covered by the range (handling
+ // characters that is not in a block as a "singleton block").
+ unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ base::uc32 pos = bottom;
+ while (pos <= top) {
+ int length =
+ isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
+ base::uc32 block_end;
+ if (length == 0) {
+ block_end = pos;
+ } else {
+ DCHECK_EQ(1, length);
+ block_end = equivalents[0];
+ }
+ int end = (block_end > top) ? top : block_end;
+ length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
+ equivalents);
+ for (int i = 0; i < length; i++) {
+ base::uc32 c = equivalents[i];
+ base::uc32 range_from = c - (block_end - pos);
+ base::uc32 range_to = c - (block_end - end);
+ if (!(bottom <= range_from && range_to <= top)) {
+ ranges->Add(CharacterRange::Range(range_from, range_to), zone);
+ }
+ }
+ pos = end + 1;
+ }
+ }
+ }
+#endif // V8_INTL_SUPPORT
+}
+
+bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) {
+ DCHECK_NOT_NULL(ranges);
+ int n = ranges->length();
+ if (n <= 1) return true;
+ base::uc32 max = ranges->at(0).to();
+ for (int i = 1; i < n; i++) {
+ CharacterRange next_range = ranges->at(i);
+ if (next_range.from() <= max + 1) return false;
+ max = next_range.to();
+ }
+ return true;
+}
+
+ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
+ if (ranges_ == nullptr) {
+ ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
+ CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
+ zone);
+ }
+ return ranges_;
+}
+
+namespace {
+
+// Move a number of elements in a zonelist to another position
+// in the same list. Handles overlapping source and target areas.
+void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
+ // Ranges are potentially overlapping.
+ if (from < to) {
+ for (int i = count - 1; i >= 0; i--) {
+ list->at(to + i) = list->at(from + i);
+ }
+ } else {
+ for (int i = 0; i < count; i++) {
+ list->at(to + i) = list->at(from + i);
+ }
+ }
+}
+
+int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
+ CharacterRange insert) {
+ // Inserts a range into list[0..count[, which must be sorted
+ // by from value and non-overlapping and non-adjacent, using at most
+ // list[0..count] for the result. Returns the number of resulting
+ // canonicalized ranges. Inserting a range may collapse existing ranges into
+ // fewer ranges, so the return value can be anything in the range 1..count+1.
+ base::uc32 from = insert.from();
+ base::uc32 to = insert.to();
+ int start_pos = 0;
+ int end_pos = count;
+ for (int i = count - 1; i >= 0; i--) {
+ CharacterRange current = list->at(i);
+ if (current.from() > to + 1) {
+ end_pos = i;
+ } else if (current.to() + 1 < from) {
+ start_pos = i + 1;
+ break;
+ }
+ }
+
+ // Inserted range overlaps, or is adjacent to, ranges at positions
+ // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
+ // not affected by the insertion.
+ // If start_pos == end_pos, the range must be inserted before start_pos.
+ // if start_pos < end_pos, the entire range from start_pos to end_pos
+ // must be merged with the insert range.
+
+ if (start_pos == end_pos) {
+ // Insert between existing ranges at position start_pos.
+ if (start_pos < count) {
+ MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
+ }
+ list->at(start_pos) = insert;
+ return count + 1;
+ }
+ if (start_pos + 1 == end_pos) {
+ // Replace single existing range at position start_pos.
+ CharacterRange to_replace = list->at(start_pos);
+ int new_from = std::min(to_replace.from(), from);
+ int new_to = std::max(to_replace.to(), to);
+ list->at(start_pos) = CharacterRange::Range(new_from, new_to);
+ return count;
+ }
+ // Replace a number of existing ranges from start_pos to end_pos - 1.
+ // Move the remaining ranges down.
+
+ int new_from = std::min(list->at(start_pos).from(), from);
+ int new_to = std::max(list->at(end_pos - 1).to(), to);
+ if (end_pos < count) {
+ MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
+ }
+ list->at(start_pos) = CharacterRange::Range(new_from, new_to);
+ return count - (end_pos - start_pos) + 1;
+}
+
+} // namespace
+
+void CharacterSet::Canonicalize() {
+ // Special/default classes are always considered canonical. The result
+ // of calling ranges() will be sorted.
+ if (ranges_ == nullptr) return;
+ CharacterRange::Canonicalize(ranges_);
+}
+
+// static
+void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
+ if (character_ranges->length() <= 1) return;
+ // Check whether ranges are already canonical (increasing, non-overlapping,
+ // non-adjacent).
+ int n = character_ranges->length();
+ base::uc32 max = character_ranges->at(0).to();
+ int i = 1;
+ while (i < n) {
+ CharacterRange current = character_ranges->at(i);
+ if (current.from() <= max + 1) {
+ break;
+ }
+ max = current.to();
+ i++;
+ }
+ // Canonical until the i'th range. If that's all of them, we are done.
+ if (i == n) return;
+
+ // The ranges at index i and forward are not canonicalized. Make them so by
+ // doing the equivalent of insertion sort (inserting each into the previous
+ // list, in order).
+ // Notice that inserting a range can reduce the number of ranges in the
+ // result due to combining of adjacent and overlapping ranges.
+ int read = i; // Range to insert.
+ int num_canonical = i; // Length of canonicalized part of list.
+ do {
+ num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
+ character_ranges->at(read));
+ read++;
+ } while (read < n);
+ character_ranges->Rewind(num_canonical);
+
+ DCHECK(CharacterRange::IsCanonical(character_ranges));
+}
+
+// static
+void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,
+ ZoneList<CharacterRange>* negated_ranges,
+ Zone* zone) {
+ DCHECK(CharacterRange::IsCanonical(ranges));
+ DCHECK_EQ(0, negated_ranges->length());
+ int range_count = ranges->length();
+ base::uc32 from = 0;
+ int i = 0;
+ if (range_count > 0 && ranges->at(0).from() == 0) {
+ from = ranges->at(0).to() + 1;
+ i = 1;
+ }
+ while (i < range_count) {
+ CharacterRange range = ranges->at(i);
+ negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
+ from = range.to() + 1;
+ i++;
+ }
+ if (from < kMaxCodePoint) {
+ negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
+ }
+}
+
+// static
+void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,
+ const ZoneList<CharacterRange>* rhs,
+ ZoneList<CharacterRange>* intersection,
+ Zone* zone) {
+ DCHECK(CharacterRange::IsCanonical(lhs));
+ DCHECK(CharacterRange::IsCanonical(rhs));
+ DCHECK_EQ(0, intersection->length());
+ int lhs_index = 0;
+ int rhs_index = 0;
+ while (lhs_index < lhs->length() && rhs_index < rhs->length()) {
+ // Skip non-overlapping ranges.
+ if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) {
+ lhs_index++;
+ continue;
+ }
+ if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) {
+ rhs_index++;
+ continue;
+ }
+
+ base::uc32 from =
+ std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from());
+ base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to());
+ intersection->Add(CharacterRange::Range(from, to), zone);
+ if (to == lhs->at(lhs_index).to()) {
+ lhs_index++;
+ } else {
+ rhs_index++;
+ }
+ }
+
+ DCHECK(IsCanonical(intersection));
+}
+
+namespace {
+
+// Advance |index| and set |from| and |to| to the new range, if not out of
+// bounds of |range|, otherwise |from| is set to a code point beyond the legal
+// unicode character range.
+void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
+ base::uc32* from, base::uc32* to) {
+ ++(*index);
+ if (*index < range->length()) {
+ *from = range->at(*index).from();
+ *to = range->at(*index).to();
+ } else {
+ *from = kMaxCodePoint + 1;
+ }
+}
+
+} // namespace
+
+// static
+void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
+ const ZoneList<CharacterRange>* to_remove,
+ ZoneList<CharacterRange>* result, Zone* zone) {
+ DCHECK(CharacterRange::IsCanonical(src));
+ DCHECK(CharacterRange::IsCanonical(to_remove));
+ DCHECK_EQ(0, result->length());
+
+ if (src->is_empty()) return;
+
+ int src_index = 0;
+ int to_remove_index = 0;
+ base::uc32 from = src->at(src_index).from();
+ base::uc32 to = src->at(src_index).to();
+ while (src_index < src->length() && to_remove_index < to_remove->length()) {
+ CharacterRange remove_range = to_remove->at(to_remove_index);
+ if (remove_range.to() < from) {
+ // (a) Non-overlapping case, ignore current to_remove range.
+ // |-------|
+ // |-------|
+ to_remove_index++;
+ } else if (to < remove_range.from()) {
+ // (b) Non-overlapping case, add full current range to result.
+ // |-------|
+ // |-------|
+ result->Add(CharacterRange::Range(from, to), zone);
+ SafeAdvanceRange(src, &src_index, &from, &to);
+ } else if (from >= remove_range.from() && to <= remove_range.to()) {
+ // (c) Current to_remove range fully covers current range.
+ // |---|
+ // |-------|
+ SafeAdvanceRange(src, &src_index, &from, &to);
+ } else if (from < remove_range.from() && to > remove_range.to()) {
+ // (d) Split current range.
+ // |-------|
+ // |---|
+ result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);
+ from = remove_range.to() + 1;
+ to_remove_index++;
+ } else if (from < remove_range.from()) {
+ // (e) End current range.
+ // |-------|
+ // |-------|
+ to = remove_range.from() - 1;
+ result->Add(CharacterRange::Range(from, to), zone);
+ SafeAdvanceRange(src, &src_index, &from, &to);
+ } else if (to > remove_range.to()) {
+ // (f) Modify start of current range.
+ // |-------|
+ // |-------|
+ from = remove_range.to() + 1;
+ to_remove_index++;
+ } else {
+ UNREACHABLE();
+ }
+ }
+ // The last range needs special treatment after |to_remove| is exhausted, as
+ // |from| might have been modified by the last |to_remove| range and |to| was
+ // not yet known (i.e. cases d and f).
+ if (from <= to) {
+ result->Add(CharacterRange::Range(from, to), zone);
+ }
+ src_index++;
+
+ // Add remaining ranges after |to_remove| is exhausted.
+ for (; src_index < src->length(); src_index++) {
+ result->Add(src->at(src_index), zone);
+ }
+
+ DCHECK(IsCanonical(result));
+}
+
+// static
+void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
+ DCHECK(IsCanonical(ranges));
+
+ // Drop all ranges that don't contain one-byte code units, and clamp the last
+ // range s.t. it likewise only contains one-byte code units. Note this relies
+ // on `ranges` being canonicalized, i.e. sorted and non-overlapping.
+
+ static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
+ int n = ranges->length();
+ for (; n > 0; n--) {
+ CharacterRange& r = ranges->at(n - 1);
+ if (r.from() <= max_char) {
+ r.to_ = std::min(r.to_, max_char);
+ break;
+ }
+ }
+
+ ranges->Rewind(n);
+}
+
+// static
+bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
+ const ZoneList<CharacterRange>* rhs) {
+ DCHECK(IsCanonical(lhs));
+ DCHECK(IsCanonical(rhs));
+ if (lhs->length() != rhs->length()) return false;
+
+ for (int i = 0; i < lhs->length(); i++) {
+ if (lhs->at(i) != rhs->at(i)) return false;
+ }
+
+ return true;
+}
+
+namespace {
+
+// Scoped object to keep track of how much we unroll quantifier loops in the
+// regexp graph generator.
+class RegExpExpansionLimiter {
+ public:
+ static const int kMaxExpansionFactor = 6;
+ RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
+ : compiler_(compiler),
+ saved_expansion_factor_(compiler->current_expansion_factor()),
+ ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
+ DCHECK_LT(0, factor);
+ if (ok_to_expand_) {
+ if (factor > kMaxExpansionFactor) {
+ // Avoid integer overflow of the current expansion factor.
+ ok_to_expand_ = false;
+ compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
+ } else {
+ int new_factor = saved_expansion_factor_ * factor;
+ ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
+ compiler->set_current_expansion_factor(new_factor);
+ }
+ }
+ }
+
+ ~RegExpExpansionLimiter() {
+ compiler_->set_current_expansion_factor(saved_expansion_factor_);
+ }
+
+ bool ok_to_expand() { return ok_to_expand_; }
+
+ private:
+ RegExpCompiler* compiler_;
+ int saved_expansion_factor_;
+ bool ok_to_expand_;
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
+};
+
+} // namespace
+
+RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
+ RegExpTree* body, RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ bool not_at_start) {
+ // x{f, t} becomes this:
+ //
+ // (r++)<-.
+ // | `
+ // | (x)
+ // v ^
+ // (r=0)-->(?)---/ [if r < t]
+ // |
+ // [if r >= f] \----> ...
+ //
+
+ // 15.10.2.5 RepeatMatcher algorithm.
+ // The parser has already eliminated the case where max is 0. In the case
+ // where max_match is zero the parser has removed the quantifier if min was
+ // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
+
+ // If we know that we cannot match zero length then things are a little
+ // simpler since we don't need to make the special zero length match check
+ // from step 2.1. If the min and max are small we can unroll a little in
+ // this case.
+ static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
+ static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
+ if (max == 0) return on_success; // This can happen due to recursion.
+ bool body_can_be_empty = (body->min_match() == 0);
+ int body_start_reg = RegExpCompiler::kNoRegister;
+ Interval capture_registers = body->CaptureRegisters();
+ bool needs_capture_clearing = !capture_registers.is_empty();
+ Zone* zone = compiler->zone();
+
+ if (body_can_be_empty) {
+ body_start_reg = compiler->AllocateRegister();
+ } else if (compiler->optimize() && !needs_capture_clearing) {
+ // Only unroll if there are no captures and the body can't be
+ // empty.
+ {
+ RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
+ if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
+ int new_max = (max == kInfinity) ? max : max - min;
+ // Recurse once to get the loop or optional matches after the fixed
+ // ones.
+ RegExpNode* answer =
+ ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
+ // Unroll the forced matches from 0 to min. This can cause chains of
+ // TextNodes (which the parser does not generate). These should be
+ // combined if it turns out they hinder good code generation.
+ for (int i = 0; i < min; i++) {
+ answer = body->ToNode(compiler, answer);
+ }
+ return answer;
+ }
+ }
+ if (max <= kMaxUnrolledMaxMatches && min == 0) {
+ DCHECK_LT(0, max); // Due to the 'if' above.
+ RegExpExpansionLimiter limiter(compiler, max);
+ if (limiter.ok_to_expand()) {
+ // Unroll the optional matches up to max.
+ RegExpNode* answer = on_success;
+ for (int i = 0; i < max; i++) {
+ ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
+ if (is_greedy) {
+ alternation->AddAlternative(
+ GuardedAlternative(body->ToNode(compiler, answer)));
+ alternation->AddAlternative(GuardedAlternative(on_success));
+ } else {
+ alternation->AddAlternative(GuardedAlternative(on_success));
+ alternation->AddAlternative(
+ GuardedAlternative(body->ToNode(compiler, answer)));
+ }
+ answer = alternation;
+ if (not_at_start && !compiler->read_backward()) {
+ alternation->set_not_at_start();
+ }
+ }
+ return answer;
+ }
+ }
+ }
+ bool has_min = min > 0;
+ bool has_max = max < RegExpTree::kInfinity;
+ bool needs_counter = has_min || has_max;
+ int reg_ctr = needs_counter ? compiler->AllocateRegister()
+ : RegExpCompiler::kNoRegister;
+ LoopChoiceNode* center = zone->New<LoopChoiceNode>(
+ body->min_match() == 0, compiler->read_backward(), min, zone);
+ if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
+ RegExpNode* loop_return =
+ needs_counter ? static_cast<RegExpNode*>(
+ ActionNode::IncrementRegister(reg_ctr, center))
+ : static_cast<RegExpNode*>(center);
+ if (body_can_be_empty) {
+ // If the body can be empty we need to check if it was and then
+ // backtrack.
+ loop_return =
+ ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
+ }
+ RegExpNode* body_node = body->ToNode(compiler, loop_return);
+ if (body_can_be_empty) {
+ // If the body can be empty we need to store the start position
+ // so we can bail out if it was empty.
+ body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
+ }
+ if (needs_capture_clearing) {
+ // Before entering the body of this loop we need to clear captures.
+ body_node = ActionNode::ClearCaptures(capture_registers, body_node);
+ }
+ GuardedAlternative body_alt(body_node);
+ if (has_max) {
+ Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
+ body_alt.AddGuard(body_guard, zone);
+ }
+ GuardedAlternative rest_alt(on_success);
+ if (has_min) {
+ Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
+ rest_alt.AddGuard(rest_guard, zone);
+ }
+ if (is_greedy) {
+ center->AddLoopAlternative(body_alt);
+ center->AddContinueAlternative(rest_alt);
+ } else {
+ center->AddContinueAlternative(rest_alt);
+ center->AddLoopAlternative(body_alt);
+ }
+ if (needs_counter) {
+ return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
+ } else {
+ return center;
+ }
+}
+
+} // namespace internal
+} // namespace v8