From 40a355a42d4a9444dc753c04c6608dade2f06a23 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:13:27 +0200 Subject: Adding upstream version 125.0.1. Signed-off-by: Daniel Baumann --- .../irregexp/imported/gen-regexp-special-case.cc | 48 --- js/src/irregexp/imported/regexp-ast.cc | 23 +- js/src/irregexp/imported/regexp-ast.h | 42 ++- .../irregexp/imported/regexp-bytecode-generator.cc | 2 +- .../irregexp/imported/regexp-bytecode-peephole.cc | 4 +- js/src/irregexp/imported/regexp-compiler-tonode.cc | 99 +++--- js/src/irregexp/imported/regexp-compiler.cc | 43 ++- js/src/irregexp/imported/regexp-compiler.h | 8 +- js/src/irregexp/imported/regexp-dotprinter.cc | 4 + js/src/irregexp/imported/regexp-interpreter.cc | 84 ++--- js/src/irregexp/imported/regexp-interpreter.h | 13 +- js/src/irregexp/imported/regexp-macro-assembler.cc | 70 ++-- js/src/irregexp/imported/regexp-macro-assembler.h | 23 +- js/src/irregexp/imported/regexp-nodes.h | 18 +- js/src/irregexp/imported/regexp-parser.cc | 393 ++++++++++++++------- js/src/irregexp/imported/regexp.h | 12 +- js/src/irregexp/imported/special-case.cc | 23 -- js/src/irregexp/imported/special-case.h | 10 - 18 files changed, 524 insertions(+), 395 deletions(-) (limited to 'js/src/irregexp/imported') diff --git a/js/src/irregexp/imported/gen-regexp-special-case.cc b/js/src/irregexp/imported/gen-regexp-special-case.cc index 8f6557ed30..0875568250 100644 --- a/js/src/irregexp/imported/gen-regexp-special-case.cc +++ b/js/src/irregexp/imported/gen-regexp-special-case.cc @@ -8,7 +8,6 @@ #include #include "irregexp/imported/special-case.h" -#include "unicode/usetiter.h" namespace v8 { namespace internal { @@ -126,52 +125,6 @@ void PrintSpecial(std::ofstream& out) { PrintSet(out, "SpecialAddSet", special_add); } -void PrintUnicodeSpecial(std::ofstream& out) { - icu::UnicodeSet non_simple_folding; - icu::UnicodeSet current; - UErrorCode status = U_ZERO_ERROR; - // Look at all characters except white spaces. - icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status); - CHECK_EQ(status, U_ZERO_ERROR); - icu::UnicodeSetIterator iter(interestingCP); - while (iter.next()) { - UChar32 c = iter.getCodepoint(); - current.set(c, c); - current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings(); - CHECK(!current.isBogus()); - // Remove characters from the closeover that have a simple case folding. - icu::UnicodeSet toRemove; - icu::UnicodeSetIterator closeOverIter(current); - while (closeOverIter.next()) { - UChar32 closeOverChar = closeOverIter.getCodepoint(); - UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT); - if (closeOverChar != closeOverSCF) { - toRemove.add(closeOverChar); - } - } - CHECK(!toRemove.isBogus()); - current.removeAll(toRemove); - - // The current character and its simple case folding are also always OK. - UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT); - current.remove(c); - current.remove(scf); - - // If there are any characters remaining, they were added due to full case - // foldings and shouldn't match the current charcter according to the spec. - if (!current.isEmpty()) { - // Ensure that the character doesn't have a simple case folding. - // Otherwise the current approach of simply removing the character from - // the set before calling closeOver won't work. - CHECK_EQ(c, scf); - non_simple_folding.add(c); - } - } - CHECK(!non_simple_folding.isBogus()); - - PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding); -} - void WriteHeader(const char* header_filename) { std::ofstream out(header_filename); out << std::hex << std::setfill('0') << std::setw(4); @@ -192,7 +145,6 @@ void WriteHeader(const char* header_filename) { << "namespace internal {\n\n"; PrintSpecial(out); - PrintUnicodeSpecial(out); out << "\n" << "} // namespace internal\n" diff --git a/js/src/irregexp/imported/regexp-ast.cc b/js/src/irregexp/imported/regexp-ast.cc index 63eeb5c05d..34946bd80c 100644 --- a/js/src/irregexp/imported/regexp-ast.cc +++ b/js/src/irregexp/imported/regexp-ast.cc @@ -307,7 +307,7 @@ void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) { } void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) { - os_ << "(?: "; + os_ << "(?" << that->flags() << ": "; that->body()->Accept(this, data); os_ << ")"; return nullptr; @@ -325,7 +325,11 @@ void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) { void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, void* data) { - os_ << "(<- " << that->index() << ")"; + os_ << "(<- " << that->captures()->first()->index(); + for (int i = 1; i < that->captures()->length(); ++i) { + os_ << "," << that->captures()->at(i)->index(); + } + os_ << ")"; return nullptr; } @@ -406,10 +410,17 @@ RegExpClassSetExpression::RegExpClassSetExpression( may_contain_strings_(may_contain_strings), operands_(operands) { DCHECK_NOT_NULL(operands); - DCHECK_IMPLIES(is_negated_, !may_contain_strings_); - max_match_ = 0; - for (auto op : *operands) { - max_match_ = std::max(max_match_, op->max_match()); + if (is_negated) { + DCHECK(!may_contain_strings_); + // We don't know anything about max matches for negated classes. + // As there are no strings involved, assume that we can match a unicode + // character (2 code points). + max_match_ = 2; + } else { + max_match_ = 0; + for (auto op : *operands) { + max_match_ = std::max(max_match_, op->max_match()); + } } } diff --git a/js/src/irregexp/imported/regexp-ast.h b/js/src/irregexp/imported/regexp-ast.h index af90b1dda3..b2b88515d3 100644 --- a/js/src/irregexp/imported/regexp-ast.h +++ b/js/src/irregexp/imported/regexp-ast.h @@ -130,12 +130,6 @@ class CharacterRange { static void AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone); -#ifdef V8_INTL_SUPPORT - // Creates the closeOver of the given UnicodeSet, removing all - // characters/strings that can't be derived via simple case folding. - static void UnicodeSimpleCloseOver(icu::UnicodeSet& set); -#endif // V8_INTL_SUPPORT - bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } base::uc32 from() const { return from_; } base::uc32 to() const { return to_; } @@ -311,9 +305,12 @@ class RegExpClassRanges final : public RegExpTree { // the specified ranges. // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split // surrogate and should not be unicode-desugared (crbug.com/641091). + // IS_CASE_FOLDED: If case folding is required (/i), it was already + // performed on individual ranges and should not be applied again. enum Flag { NEGATED = 1 << 0, CONTAINS_SPLIT_SURROGATE = 1 << 1, + IS_CASE_FOLDED = 1 << 2, }; using ClassRangesFlags = base::Flags; @@ -356,6 +353,9 @@ class RegExpClassRanges final : public RegExpTree { bool contains_split_surrogate() const { return (class_ranges_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; } + bool is_case_folded() const { + return (class_ranges_flags_ & IS_CASE_FOLDED) != 0; + } private: CharacterSet set_; @@ -626,8 +626,9 @@ class RegExpCapture final : public RegExpTree { class RegExpGroup final : public RegExpTree { public: - explicit RegExpGroup(RegExpTree* body) + explicit RegExpGroup(RegExpTree* body, RegExpFlags flags) : body_(body), + flags_(flags), min_match_(body->min_match()), max_match_(body->max_match()) {} @@ -639,9 +640,11 @@ class RegExpGroup final : public RegExpTree { int max_match() override { return max_match_; } Interval CaptureRegisters() override { return body_->CaptureRegisters(); } RegExpTree* body() const { return body_; } + RegExpFlags flags() const { return flags_; } private: RegExpTree* body_; + const RegExpFlags flags_; int min_match_; int max_match_; }; @@ -651,12 +654,13 @@ class RegExpLookaround final : public RegExpTree { enum Type { LOOKAHEAD, LOOKBEHIND }; RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, - int capture_from, Type type) + int capture_from, Type type, int index) : body_(body), is_positive_(is_positive), capture_count_(capture_count), capture_from_(capture_from), - type_(type) {} + type_(type), + index_(index) {} DECL_BOILERPLATE(Lookaround); @@ -669,6 +673,7 @@ class RegExpLookaround final : public RegExpTree { int capture_count() const { return capture_count_; } int capture_from() const { return capture_from_; } Type type() const { return type_; } + int index() const { return index_; } class Builder { public: @@ -692,14 +697,17 @@ class RegExpLookaround final : public RegExpTree { int capture_count_; int capture_from_; Type type_; + int index_; }; class RegExpBackReference final : public RegExpTree { public: - explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {} - RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) - : capture_(capture), flags_(flags) {} + explicit RegExpBackReference(Zone* zone) : captures_(1, zone) {} + explicit RegExpBackReference(RegExpCapture* capture, Zone* zone) + : captures_(1, zone) { + captures_.Add(capture, zone); + } DECL_BOILERPLATE(BackReference); @@ -707,16 +715,16 @@ class RegExpBackReference final : public RegExpTree { // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite // recursion, we give up. Ignorance is bliss. int max_match() override { return kInfinity; } - int index() const { return capture_->index(); } - RegExpCapture* capture() const { return capture_; } - void set_capture(RegExpCapture* capture) { capture_ = capture; } + const ZoneList* captures() const { return &captures_; } + void add_capture(RegExpCapture* capture, Zone* zone) { + captures_.Add(capture, zone); + } const ZoneVector* name() const { return name_; } void set_name(const ZoneVector* name) { name_ = name; } private: - RegExpCapture* capture_ = nullptr; + ZoneList captures_; const ZoneVector* name_ = nullptr; - const RegExpFlags flags_; }; diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.cc b/js/src/irregexp/imported/regexp-bytecode-generator.cc index c83e10a598..251ed1cda5 100644 --- a/js/src/irregexp/imported/regexp-bytecode-generator.cc +++ b/js/src/irregexp/imported/regexp-bytecode-generator.cc @@ -383,7 +383,7 @@ Handle RegExpBytecodeGenerator::GetCode(Handle source) { isolate_, zone(), source, buffer_.data(), length(), jump_edges_); } else { array = isolate_->factory()->NewByteArray(length()); - Copy(array->GetDataStartAddress()); + Copy(array->begin()); } return array; diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.cc b/js/src/irregexp/imported/regexp-bytecode-peephole.cc index ec8dcf1108..0ef0bab702 100644 --- a/js/src/irregexp/imported/regexp-bytecode-peephole.cc +++ b/js/src/irregexp/imported/regexp-bytecode-peephole.cc @@ -1012,13 +1012,13 @@ Handle RegExpBytecodePeepholeOptimization::OptimizeBytecode( RegExpBytecodePeephole peephole(zone, length, jump_edges); bool did_optimize = peephole.OptimizeBytecode(bytecode, length); Handle array = isolate->factory()->NewByteArray(peephole.Length()); - peephole.CopyOptimizedBytecode(array->GetDataStartAddress()); + peephole.CopyOptimizedBytecode(array->begin()); if (did_optimize && v8_flags.trace_regexp_peephole_optimization) { PrintF("Original Bytecode:\n"); RegExpBytecodeDisassemble(bytecode, length, source->ToCString().get()); PrintF("Optimized Bytecode:\n"); - RegExpBytecodeDisassemble(array->GetDataStartAddress(), peephole.Length(), + RegExpBytecodeDisassemble(array->begin(), peephole.Length(), source->ToCString().get()); } diff --git a/js/src/irregexp/imported/regexp-compiler-tonode.cc b/js/src/irregexp/imported/regexp-compiler-tonode.cc index f5087bdb08..b1340123d8 100644 --- a/js/src/irregexp/imported/regexp-compiler-tonode.cc +++ b/js/src/irregexp/imported/regexp-compiler-tonode.cc @@ -3,7 +3,6 @@ // found in the LICENSE file. #include "irregexp/imported/regexp-compiler.h" - #include "irregexp/imported/regexp.h" #ifdef V8_INTL_SUPPORT @@ -418,27 +417,6 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, } // namespace -#ifdef V8_INTL_SUPPORT -// static -void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) { - // Remove characters for which closeOver() adds full-case-folding equivalents - // because we should work only with simple case folding mappings. - icu::UnicodeSet non_simple = icu::UnicodeSet(set); - non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet()); - set.removeAll(non_simple); - - set.closeOver(USET_CASE_INSENSITIVE); - // Full case folding maps single characters to multiple characters. - // Those are represented as strings in the set. Remove them so that - // we end up with only simple and common case mappings. - set.removeAllStrings(); - - // Add characters that have non-simple case foldings again (they match - // themselves). - set.addAll(non_simple); -} -#endif // V8_INTL_SUPPORT - // static void CharacterRange::AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone) { @@ -460,8 +438,7 @@ void CharacterRange::AddUnicodeCaseEquivalents(ZoneList* ranges, } // Clear the ranges list without freeing the backing store. ranges->Rewind(0); - - UnicodeSimpleCloseOver(set); + set.closeOver(USET_SIMPLE_CASE_INSENSITIVE); for (int i = 0; i < set.getRangeCount(); i++) { ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone); } @@ -476,7 +453,9 @@ RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler, Zone* const zone = compiler->zone(); ZoneList* ranges = this->ranges(zone); - if (NeedsUnicodeCaseEquivalents(compiler->flags())) { + const bool needs_case_folding = + NeedsUnicodeCaseEquivalents(compiler->flags()) && !is_case_folded(); + if (needs_case_folding) { CharacterRange::AddUnicodeCaseEquivalents(ranges, zone); } @@ -487,8 +466,7 @@ RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler, if (is_negated()) { // With /v, character classes are never negated. - // TODO(v8:11935): Change permalink once proposal is in stage 4. - // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom + // https://tc39.es/ecma262/#sec-compileatom // Atom :: CharacterClass // 4. Assert: cc.[[Invert]] is false. // Instead the complement is created when evaluating the class set. @@ -561,7 +539,12 @@ RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler, } } if (!ranges()->is_empty()) { - alternatives->Add(zone->template New(zone, ranges()), + // In unicode sets mode case folding has to be done at precise locations + // (e.g. before building complements). + // It is therefore the parsers responsibility to case fold (sub-) ranges + // before creating ClassSetOperands. + alternatives->Add(zone->template New( + zone, ranges(), RegExpClassRanges::IS_CASE_FOLDED), zone); } if (empty_string != nullptr) { @@ -1034,9 +1017,8 @@ namespace { // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, RegExpNode* on_success, - RegExpAssertion::Type type, - RegExpFlags flags) { - CHECK(NeedsUnicodeCaseEquivalents(flags)); + RegExpAssertion::Type type) { + CHECK(NeedsUnicodeCaseEquivalents(compiler->flags())); Zone* zone = compiler->zone(); ZoneList* word_range = zone->New>(2, zone); @@ -1080,14 +1062,13 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, return AssertionNode::AtStart(on_success); case Type::BOUNDARY: return NeedsUnicodeCaseEquivalents(compiler->flags()) - ? BoundaryAssertionAsLookaround( - compiler, on_success, Type::BOUNDARY, compiler->flags()) + ? BoundaryAssertionAsLookaround(compiler, on_success, + Type::BOUNDARY) : AssertionNode::AtBoundary(on_success); case Type::NON_BOUNDARY: return NeedsUnicodeCaseEquivalents(compiler->flags()) ? BoundaryAssertionAsLookaround(compiler, on_success, - Type::NON_BOUNDARY, - compiler->flags()) + Type::NON_BOUNDARY) : AssertionNode::AtNonBoundary(on_success); case Type::END_OF_INPUT: return AssertionNode::AtEnd(on_success); @@ -1130,10 +1111,17 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - return compiler->zone()->New( - RegExpCapture::StartRegister(index()), - RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(), - on_success); + RegExpNode* backref_node = on_success; + // Only one of the captures in the list can actually match. Since + // back-references to unmatched captures are treated as empty, we can simply + // create back-references to all possible captures. + for (auto capture : *captures()) { + backref_node = compiler->zone()->New( + RegExpCapture::StartRegister(capture->index()), + RegExpCapture::EndRegister(capture->index()), compiler->read_backward(), + backref_node); + } + return backref_node; } RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, @@ -1141,9 +1129,40 @@ RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, return on_success; } +namespace { + +class V8_NODISCARD ModifiersScope { + public: + ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags) + : compiler_(compiler), previous_flags_(compiler->flags()) { + compiler->set_flags(flags); + } + ~ModifiersScope() { compiler_->set_flags(previous_flags_); } + + private: + RegExpCompiler* compiler_; + const RegExpFlags previous_flags_; +}; + +} // namespace + RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - return body_->ToNode(compiler, on_success); + // If no flags are modified, simply convert and return the body. + if (flags() == compiler->flags()) { + return body_->ToNode(compiler, on_success); + } + // Reset flags for successor node. + const RegExpFlags old_flags = compiler->flags(); + on_success = ActionNode::ModifyFlags(old_flags, on_success); + + // Convert body using modifier. + ModifiersScope modifiers_scope(compiler, flags()); + RegExpNode* body = body_->ToNode(compiler, on_success); + + // Wrap body into modifier node. + RegExpNode* modified_body = ActionNode::ModifyFlags(flags(), body); + return modified_body; } RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, diff --git a/js/src/irregexp/imported/regexp-compiler.cc b/js/src/irregexp/imported/regexp-compiler.cc index 514975d8ed..73dfe1d2ad 100644 --- a/js/src/irregexp/imported/regexp-compiler.cc +++ b/js/src/irregexp/imported/regexp-compiler.cc @@ -707,6 +707,13 @@ ActionNode* ActionNode::EmptyMatchCheck(int start_register, return result; } +ActionNode* ActionNode::ModifyFlags(RegExpFlags flags, RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New(MODIFY_FLAGS, on_success); + result->data_.u_modify_flags.flags = flags; + return result; +} + #define DEFINE_ACCEPT(Type) \ void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); } FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) @@ -1377,6 +1384,9 @@ void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details, on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler, filled_in, not_at_start); } else { + if (action_type() == MODIFY_FLAGS) { + compiler->set_flags(flags()); + } on_success()->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); } @@ -2867,7 +2877,7 @@ int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead, const int kSkipArrayEntry = 0; const int kDontSkipArrayEntry = 1; - std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry, + std::memset(boolean_skip_table->begin(), kSkipArrayEntry, boolean_skip_table->length()); for (int i = max_lookahead; i >= min_lookahead; i--) { @@ -3454,6 +3464,11 @@ void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { assembler->Backtrack(); return; } + case MODIFY_FLAGS: { + compiler->set_flags(flags()); + on_success()->Emit(compiler, trace); + break; + } default: UNREACHABLE(); } @@ -3473,8 +3488,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RecursionCheck rc(compiler); DCHECK_EQ(start_reg_ + 1, end_reg_); - if (IsIgnoreCase(flags_)) { - bool unicode = IsEitherUnicode(flags_); + if (IsIgnoreCase(compiler->flags())) { + bool unicode = IsEitherUnicode(compiler->flags()); assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), unicode, trace->backtrack()); } else { @@ -3485,7 +3500,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { if (read_backward()) trace->set_at_start(Trace::UNKNOWN); // Check that the back reference does not end inside a surrogate pair. - if (IsEitherUnicode(flags_) && !compiler->one_byte()) { + if (IsEitherUnicode(compiler->flags()) && !compiler->one_byte()) { assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); } on_success()->Emit(compiler, trace); @@ -3707,7 +3722,7 @@ class Analysis : public NodeVisitor { } while (false) void VisitText(TextNode* that) override { - that->MakeCaseIndependent(isolate(), is_one_byte_, flags_); + that->MakeCaseIndependent(isolate(), is_one_byte_, flags()); EnsureAnalyzed(that->on_success()); if (has_failed()) return; that->CalculateOffsets(); @@ -3715,6 +3730,9 @@ class Analysis : public NodeVisitor { } void VisitAction(ActionNode* that) override { + if (that->action_type() == ActionNode::MODIFY_FLAGS) { + set_flags(that->flags()); + } EnsureAnalyzed(that->on_success()); if (has_failed()) return; STATIC_FOR_EACH(Propagators::VisitAction(that)); @@ -3773,9 +3791,12 @@ class Analysis : public NodeVisitor { #undef STATIC_FOR_EACH private: + RegExpFlags flags() const { return flags_; } + void set_flags(RegExpFlags flags) { flags_ = flags; } + Isolate* isolate_; const bool is_one_byte_; - const RegExpFlags flags_; + RegExpFlags flags_; RegExpError error_; DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); @@ -3903,13 +3924,12 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( } RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, - RegExpFlags flags, bool is_one_byte) { // Wrap the body of the regexp in capture #0. RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 0, this, accept()); RegExpNode* node = captured_body; - if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) { + if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags())) { // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning or sticky. RegExpNode* loop_node = RegExpQuantifier::ToNode( @@ -3931,13 +3951,14 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags()); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. if (node != nullptr) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags()); } - } else if (IsEitherUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) { + } else if (IsEitherUnicode(flags()) && + (IsGlobal(flags()) || IsSticky(flags()))) { node = OptionallyStepBackToLeadSurrogate(node); } diff --git a/js/src/irregexp/imported/regexp-compiler.h b/js/src/irregexp/imported/regexp-compiler.h index 91dd43ab8a..7a369430bb 100644 --- a/js/src/irregexp/imported/regexp-compiler.h +++ b/js/src/irregexp/imported/regexp-compiler.h @@ -501,8 +501,7 @@ class RegExpCompiler { // - Inserting the implicit .* before/after the regexp if necessary. // - If the input is a one-byte string, filtering out nodes that can't match. // - Fixing up regexp matches that start within a surrogate pair. - RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags, - bool is_one_byte); + RegExpNode* PreprocessRegExp(RegExpCompileData* data, bool is_one_byte); // If the regexp matching starts within a surrogate pair, step back to the // lead surrogate and start matching from there. @@ -527,7 +526,8 @@ class RegExpCompiler { inline void IncrementRecursionDepth() { recursion_depth_++; } inline void DecrementRecursionDepth() { recursion_depth_--; } - RegExpFlags flags() const { return flags_; } + inline RegExpFlags flags() const { return flags_; } + inline void set_flags(RegExpFlags flags) { flags_ = flags; } void SetRegExpTooBig() { reg_exp_too_big_ = true; } @@ -571,7 +571,7 @@ class RegExpCompiler { int unicode_lookaround_position_register_; ZoneVector* work_list_; int recursion_depth_; - const RegExpFlags flags_; + RegExpFlags flags_; RegExpMacroAssembler* macro_assembler_; bool one_byte_; bool reg_exp_too_big_; diff --git a/js/src/irregexp/imported/regexp-dotprinter.cc b/js/src/irregexp/imported/regexp-dotprinter.cc index 6746992a0a..cd0ca5dea8 100644 --- a/js/src/irregexp/imported/regexp-dotprinter.cc +++ b/js/src/irregexp/imported/regexp-dotprinter.cc @@ -231,6 +231,10 @@ void DotPrinterImpl::VisitAction(ActionNode* that) { << "\", shape=septagon"; break; } + case ActionNode::MODIFY_FLAGS: { + os_ << "label=\"flags $" << that->flags() << "\", shape=septagon"; + break; + } } os_ << "];\n"; PrintAttributes(that); diff --git a/js/src/irregexp/imported/regexp-interpreter.cc b/js/src/irregexp/imported/regexp-interpreter.cc index 43c8a4a5a4..2de1b12968 100644 --- a/js/src/irregexp/imported/regexp-interpreter.cc +++ b/js/src/irregexp/imported/regexp-interpreter.cc @@ -88,8 +88,7 @@ int32_t Load32Aligned(const uint8_t* pc) { return *reinterpret_cast(pc); } -// TODO(jgruber): Rename to Load16AlignedUnsigned. -uint32_t Load16Aligned(const uint8_t* pc) { +uint32_t Load16AlignedUnsigned(const uint8_t* pc) { DCHECK_EQ(0, reinterpret_cast(pc) & 1); return *reinterpret_cast(pc); } @@ -221,17 +220,17 @@ IrregexpInterpreter::Result MaybeThrowStackOverflow( template void UpdateCodeAndSubjectReferences( Isolate* isolate, Handle code_array, - Handle subject_string, ByteArray* code_array_out, + Handle subject_string, Tagged* code_array_out, const uint8_t** code_base_out, const uint8_t** pc_out, - String* subject_string_out, + Tagged* subject_string_out, base::Vector* subject_string_vector_out) { DisallowGarbageCollection no_gc; - if (*code_base_out != code_array->GetDataStartAddress()) { + if (*code_base_out != code_array->begin()) { *code_array_out = *code_array; const intptr_t pc_offset = *pc_out - *code_base_out; DCHECK_GT(pc_offset, 0); - *code_base_out = code_array->GetDataStartAddress(); + *code_base_out = code_array->begin(); *pc_out = *code_base_out + pc_offset; } @@ -244,8 +243,9 @@ void UpdateCodeAndSubjectReferences( // necessary. template IrregexpInterpreter::Result HandleInterrupts( - Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out, - String* subject_string_out, const uint8_t** code_base_out, + Isolate* isolate, RegExp::CallOrigin call_origin, + Tagged* code_array_out, Tagged* subject_string_out, + const uint8_t** code_base_out, base::Vector* subject_string_vector_out, const uint8_t** pc_out) { DisallowGarbageCollection no_gc; @@ -276,12 +276,12 @@ IrregexpInterpreter::Result HandleInterrupts( } else if (check.InterruptRequested()) { const bool was_one_byte = String::IsOneByteRepresentationUnderneath(*subject_string_out); - Object result; + Tagged result; { AllowGarbageCollection yes_gc; result = isolate->stack_guard()->HandleInterrupts(); } - if (result.IsException(isolate)) { + if (IsException(result, isolate)) { return IrregexpInterpreter::EXCEPTION; } @@ -375,10 +375,10 @@ bool IndexIsInBounds(int index, int length) { template IrregexpInterpreter::Result RawMatch( - Isolate* isolate, ByteArray code_array, String subject_string, - base::Vector subject, int* output_registers, - int output_register_count, int total_register_count, int current, - uint32_t current_char, RegExp::CallOrigin call_origin, + Isolate* isolate, Tagged code_array, + Tagged subject_string, base::Vector subject, + int* output_registers, int output_register_count, int total_register_count, + int current, uint32_t current_char, RegExp::CallOrigin call_origin, const uint32_t backtrack_limit) { DisallowGarbageCollection no_gc; @@ -430,7 +430,7 @@ IrregexpInterpreter::Result RawMatch( #endif // V8_USE_COMPUTED_GOTO - const uint8_t* pc = code_array.GetDataStartAddress(); + const uint8_t* pc = code_array->begin(); const uint8_t* code_base = pc; InterpreterRegisters registers(total_register_count, output_registers, @@ -702,8 +702,8 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { uint32_t c = LoadPacked24Unsigned(insn); - uint32_t minus = Load16Aligned(pc + 4); - uint32_t mask = Load16Aligned(pc + 6); + uint32_t minus = Load16AlignedUnsigned(pc + 4); + uint32_t mask = Load16AlignedUnsigned(pc + 6); if (c != ((current_char - minus) & mask)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { @@ -712,8 +712,8 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_CHAR_IN_RANGE) { - uint32_t from = Load16Aligned(pc + 4); - uint32_t to = Load16Aligned(pc + 6); + uint32_t from = Load16AlignedUnsigned(pc + 4); + uint32_t to = Load16AlignedUnsigned(pc + 6); if (from <= current_char && current_char <= to) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { @@ -722,8 +722,8 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { - uint32_t from = Load16Aligned(pc + 4); - uint32_t to = Load16Aligned(pc + 6); + uint32_t from = Load16AlignedUnsigned(pc + 4); + uint32_t to = Load16AlignedUnsigned(pc + 6); if (from > current_char || current_char > to) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { @@ -914,7 +914,7 @@ IrregexpInterpreter::Result RawMatch( BYTECODE(SKIP_UNTIL_CHAR) { int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); - uint32_t c = Load16Aligned(pc + 6); + uint32_t c = Load16AlignedUnsigned(pc + 6); while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; if (c == current_char) { @@ -929,7 +929,7 @@ IrregexpInterpreter::Result RawMatch( BYTECODE(SKIP_UNTIL_CHAR_AND) { int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); - uint16_t c = Load16Aligned(pc + 6); + uint16_t c = Load16AlignedUnsigned(pc + 6); uint32_t mask = Load32Aligned(pc + 8); int32_t maximum_offset = Load32Aligned(pc + 12); while (static_cast(current + maximum_offset) <= @@ -947,7 +947,7 @@ IrregexpInterpreter::Result RawMatch( BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) { int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); - uint16_t c = Load16Aligned(pc + 6); + uint16_t c = Load16AlignedUnsigned(pc + 6); int32_t maximum_offset = Load32Aligned(pc + 8); while (static_cast(current + maximum_offset) <= static_cast(subject.length())) { @@ -979,7 +979,7 @@ IrregexpInterpreter::Result RawMatch( BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) { int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); - uint16_t limit = Load16Aligned(pc + 6); + uint16_t limit = Load16AlignedUnsigned(pc + 6); const uint8_t* table = pc + 8; while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; @@ -999,8 +999,8 @@ IrregexpInterpreter::Result RawMatch( BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) { int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load32Aligned(pc + 4); - uint16_t c = Load16Aligned(pc + 8); - uint16_t c2 = Load16Aligned(pc + 10); + uint16_t c = Load16AlignedUnsigned(pc + 8); + uint16_t c2 = Load16AlignedUnsigned(pc + 10); while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; // The two if-statements below are split up intentionally, as combining @@ -1047,29 +1047,29 @@ IrregexpInterpreter::Result RawMatch( // static IrregexpInterpreter::Result IrregexpInterpreter::Match( - Isolate* isolate, JSRegExp regexp, String subject_string, + Isolate* isolate, Tagged regexp, Tagged subject_string, int* output_registers, int output_register_count, int start_position, RegExp::CallOrigin call_origin) { - if (v8_flags.regexp_tier_up) regexp.TierUpTick(); + if (v8_flags.regexp_tier_up) regexp->TierUpTick(); bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string); - ByteArray code_array = ByteArray::cast(regexp.bytecode(is_one_byte)); - int total_register_count = regexp.max_register_count(); + Tagged code_array = ByteArray::cast(regexp->bytecode(is_one_byte)); + int total_register_count = regexp->max_register_count(); return MatchInternal(isolate, code_array, subject_string, output_registers, output_register_count, total_register_count, - start_position, call_origin, regexp.backtrack_limit()); + start_position, call_origin, regexp->backtrack_limit()); } IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( - Isolate* isolate, ByteArray code_array, String subject_string, - int* output_registers, int output_register_count, int total_register_count, - int start_position, RegExp::CallOrigin call_origin, - uint32_t backtrack_limit) { - DCHECK(subject_string.IsFlat()); + Isolate* isolate, Tagged code_array, + Tagged subject_string, int* output_registers, + int output_register_count, int total_register_count, int start_position, + RegExp::CallOrigin call_origin, uint32_t backtrack_limit) { + DCHECK(subject_string->IsFlat()); // TODO(chromium:1262676): Remove this CHECK once fixed. - CHECK(code_array.IsByteArray()); + CHECK(IsByteArray(code_array)); // Note: Heap allocation *is* allowed in two situations if calling from // Runtime: @@ -1080,7 +1080,7 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( DisallowGarbageCollection no_gc; base::uc16 previous_char = '\n'; - String::FlatContent subject_content = subject_string.GetFlatContent(no_gc); + String::FlatContent subject_content = subject_string->GetFlatContent(no_gc); // Because interrupts can result in GC and string content relocation, the // checksum verification in FlatContent may fail even though this code is // safe. See (2) above. @@ -1122,10 +1122,10 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( DisallowHandleAllocation no_handles; DisallowHandleDereference no_deref; - String subject_string = String::cast(Object(subject)); - JSRegExp regexp_obj = JSRegExp::cast(Object(regexp)); + Tagged subject_string = String::cast(Tagged(subject)); + Tagged regexp_obj = JSRegExp::cast(Tagged(regexp)); - if (regexp_obj.MarkedForTierUp()) { + if (regexp_obj->MarkedForTierUp()) { // Returning RETRY will re-enter through runtime, where actual recompilation // for tier-up takes place. return IrregexpInterpreter::RETRY; diff --git a/js/src/irregexp/imported/regexp-interpreter.h b/js/src/irregexp/imported/regexp-interpreter.h index bc55be2b8c..825916291f 100644 --- a/js/src/irregexp/imported/regexp-interpreter.h +++ b/js/src/irregexp/imported/regexp-interpreter.h @@ -49,17 +49,18 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic { RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp); - static Result MatchInternal(Isolate* isolate, ByteArray code_array, - String subject_string, int* output_registers, - int output_register_count, + static Result MatchInternal(Isolate* isolate, Tagged code_array, + Tagged subject_string, + int* output_registers, int output_register_count, int total_register_count, int start_position, RegExp::CallOrigin call_origin, uint32_t backtrack_limit); private: - static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string, - int* output_registers, int output_register_count, - int start_position, RegExp::CallOrigin call_origin); + static Result Match(Isolate* isolate, Tagged regexp, + Tagged subject_string, int* output_registers, + int output_register_count, int start_position, + RegExp::CallOrigin call_origin); }; } // namespace internal diff --git a/js/src/irregexp/imported/regexp-macro-assembler.cc b/js/src/irregexp/imported/regexp-macro-assembler.cc index b4d99bf775..b99c08424e 100644 --- a/js/src/irregexp/imported/regexp-macro-assembler.cc +++ b/js/src/irregexp/imported/regexp-macro-assembler.cc @@ -182,24 +182,25 @@ uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char, static constexpr uint32_t kTrue = 1; static constexpr uint32_t kFalse = 0; - FixedUInt16Array ranges = FixedUInt16Array::cast(Object(raw_byte_array)); - DCHECK_GE(ranges.length(), 1); + Tagged ranges = + FixedUInt16Array::cast(Tagged(raw_byte_array)); + DCHECK_GE(ranges->length(), 1); // Shortcut for fully out of range chars. - if (current_char < ranges.get(0)) return kFalse; - if (current_char >= ranges.get(ranges.length() - 1)) { + if (current_char < ranges->get(0)) return kFalse; + if (current_char >= ranges->get(ranges->length() - 1)) { // The last range may be open-ended. - return (ranges.length() % 2) == 0 ? kFalse : kTrue; + return (ranges->length() % 2) == 0 ? kFalse : kTrue; } // Binary search for the matching range. `ranges` is encoded as // [from0, to0, from1, to1, ..., fromN, toN], or // [from0, to0, from1, to1, ..., fromN] (open-ended last interval). - int mid, lower = 0, upper = ranges.length(); + int mid, lower = 0, upper = ranges->length(); do { mid = lower + (upper - lower) / 2; - const base::uc16 elem = ranges.get(mid); + const base::uc16 elem = ranges->get(mid); if (current_char < elem) { upper = mid; } else if (current_char > elem) { @@ -210,7 +211,7 @@ uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char, } } while (lower < upper); - const bool current_char_ge_last_elem = current_char >= ranges.get(mid); + const bool current_char_ge_last_elem = current_char >= ranges->get(mid); const int current_range_start_index = current_char_ge_last_elem ? mid : mid - 1; @@ -277,15 +278,16 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() const { // static int NativeRegExpMacroAssembler::CheckStackGuardState( Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, - Address* return_address, InstructionStream re_code, Address* subject, - const uint8_t** input_start, const uint8_t** input_end) { + Address* return_address, Tagged re_code, + Address* subject, const uint8_t** input_start, const uint8_t** input_end, + uintptr_t gap) { DisallowGarbageCollection no_gc; Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0); - DCHECK_LE(re_code.instruction_start(), old_pc); - DCHECK_LE(old_pc, re_code.code(kAcquireLoad).instruction_end()); + DCHECK_LE(re_code->instruction_start(), old_pc); + DCHECK_LE(old_pc, re_code->code(kAcquireLoad)->instruction_end()); StackLimitCheck check(isolate); - bool js_has_overflowed = check.JsHasOverflowed(); + bool js_has_overflowed = check.JsHasOverflowed(gap); if (call_origin == RegExp::CallOrigin::kFromJs) { // Direct calls from JavaScript can be interrupted in two ways: @@ -310,7 +312,8 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( // Prepare for possible GC. HandleScope handles(isolate); Handle code_handle(re_code, isolate); - Handle subject_handle(String::cast(Object(*subject)), isolate); + Handle subject_handle(String::cast(Tagged(*subject)), + isolate); bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle); int return_value = 0; @@ -322,8 +325,8 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( return_value = EXCEPTION; } else if (check.InterruptRequested()) { AllowGarbageCollection yes_gc; - Object result = isolate->stack_guard()->HandleInterrupts(); - if (result.IsException(isolate)) return_value = EXCEPTION; + Tagged result = isolate->stack_guard()->HandleInterrupts(); + if (IsException(result, isolate)) return_value = EXCEPTION; } // We are not using operator == here because it does a slow DCHECK @@ -371,34 +374,34 @@ int NativeRegExpMacroAssembler::Match(Handle regexp, // DisallowGarbageCollection, since regexps might be preempted, and another // thread might do allocation anyway. - String subject_ptr = *subject; + Tagged subject_ptr = *subject; // Character offsets into string. int start_offset = previous_index; - int char_length = subject_ptr.length() - start_offset; + int char_length = subject_ptr->length() - start_offset; int slice_offset = 0; // The string has been flattened, so if it is a cons string it contains the // full string in the first part. if (StringShape(subject_ptr).IsCons()) { - DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length()); - subject_ptr = ConsString::cast(subject_ptr).first(); + DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length()); + subject_ptr = ConsString::cast(subject_ptr)->first(); } else if (StringShape(subject_ptr).IsSliced()) { - SlicedString slice = SlicedString::cast(subject_ptr); - subject_ptr = slice.parent(); - slice_offset = slice.offset(); + Tagged slice = SlicedString::cast(subject_ptr); + subject_ptr = slice->parent(); + slice_offset = slice->offset(); } if (StringShape(subject_ptr).IsThin()) { - subject_ptr = ThinString::cast(subject_ptr).actual(); + subject_ptr = ThinString::cast(subject_ptr)->actual(); } // Ensure that an underlying string has the same representation. - bool is_one_byte = subject_ptr.IsOneByteRepresentation(); - DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString()); + bool is_one_byte = subject_ptr->IsOneByteRepresentation(); + DCHECK(IsExternalString(subject_ptr) || IsSeqString(subject_ptr)); // String is now either Sequential or External int char_size_shift = is_one_byte ? 0 : 1; DisallowGarbageCollection no_gc; const uint8_t* input_start = - subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc); + subject_ptr->AddressOfCharacterAt(start_offset + slice_offset, no_gc); int byte_length = char_length << char_size_shift; const uint8_t* input_end = input_start + byte_length; return Execute(*subject, start_offset, input_start, input_end, offsets_vector, @@ -407,9 +410,9 @@ int NativeRegExpMacroAssembler::Match(Handle regexp, // static int NativeRegExpMacroAssembler::ExecuteForTesting( - String input, int start_offset, const uint8_t* input_start, + Tagged input, int start_offset, const uint8_t* input_start, const uint8_t* input_end, int* output, int output_size, Isolate* isolate, - JSRegExp regexp) { + Tagged regexp) { return Execute(input, start_offset, input_start, input_end, output, output_size, isolate, regexp); } @@ -419,13 +422,14 @@ int NativeRegExpMacroAssembler::ExecuteForTesting( // the signature of the interpreter. We should get rid of JS objects passed to // internal methods. int NativeRegExpMacroAssembler::Execute( - String input, // This needs to be the unpacked (sliced, cons) string. + Tagged + input, // This needs to be the unpacked (sliced, cons) string. int start_offset, const uint8_t* input_start, const uint8_t* input_end, - int* output, int output_size, Isolate* isolate, JSRegExp regexp) { + int* output, int output_size, Isolate* isolate, Tagged regexp) { RegExpStackScope stack_scope(isolate); bool is_one_byte = String::IsOneByteRepresentationUnderneath(input); - Code code = Code::cast(regexp.code(is_one_byte)); + Tagged code = Code::cast(regexp->code(isolate, is_one_byte)); RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime; using RegexpMatcherSig = @@ -439,7 +443,7 @@ int NativeRegExpMacroAssembler::Execute( output, output_size, call_origin, isolate, regexp.ptr()); DCHECK_GE(result, SMALLEST_REGEXP_RESULT); - if (result == EXCEPTION && !isolate->has_pending_exception()) { + if (result == EXCEPTION && !isolate->has_exception()) { // We detected a stack overflow (on the backtrack stack) in RegExp code, // but haven't created the exception yet. Additionally, we allow heap // allocation because even though it invalidates {input_start} and diff --git a/js/src/irregexp/imported/regexp-macro-assembler.h b/js/src/irregexp/imported/regexp-macro-assembler.h index af7e4f5297..6863adbaff 100644 --- a/js/src/irregexp/imported/regexp-macro-assembler.h +++ b/js/src/irregexp/imported/regexp-macro-assembler.h @@ -301,12 +301,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { int* offsets_vector, int offsets_vector_length, int previous_index, Isolate* isolate); - V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset, - const uint8_t* input_start, - const uint8_t* input_end, - int* output, int output_size, - Isolate* isolate, - JSRegExp regexp); + V8_EXPORT_PRIVATE static int ExecuteForTesting( + Tagged input, int start_offset, const uint8_t* input_start, + const uint8_t* input_end, int* output, int output_size, Isolate* isolate, + Tagged regexp); bool CanReadUnaligned() const override; @@ -330,9 +328,9 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { static int CheckStackGuardState(Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, Address* return_address, - InstructionStream re_code, Address* subject, - const uint8_t** input_start, - const uint8_t** input_end); + Tagged re_code, + Address* subject, const uint8_t** input_start, + const uint8_t** input_end, uintptr_t gap); static Address word_character_map_address() { return reinterpret_cast
(&word_character_map[0]); @@ -348,9 +346,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { private: // Returns a {Result} sentinel, or the number of successful matches. - static int Execute(String input, int start_offset, const uint8_t* input_start, - const uint8_t* input_end, int* output, int output_size, - Isolate* isolate, JSRegExp regexp); + static int Execute(Tagged input, int start_offset, + const uint8_t* input_start, const uint8_t* input_end, + int* output, int output_size, Isolate* isolate, + Tagged regexp); ZoneUnorderedMap> range_array_cache_; }; diff --git a/js/src/irregexp/imported/regexp-nodes.h b/js/src/irregexp/imported/regexp-nodes.h index 9407f1c5ec..f3d7e6c58f 100644 --- a/js/src/irregexp/imported/regexp-nodes.h +++ b/js/src/irregexp/imported/regexp-nodes.h @@ -318,7 +318,8 @@ class ActionNode : public SeqRegExpNode { BEGIN_NEGATIVE_SUBMATCH, POSITIVE_SUBMATCH_SUCCESS, EMPTY_MATCH_CHECK, - CLEAR_CAPTURES + CLEAR_CAPTURES, + MODIFY_FLAGS }; static ActionNode* SetRegisterForLoop(int reg, int val, RegExpNode* on_success); @@ -341,6 +342,7 @@ class ActionNode : public SeqRegExpNode { int repetition_register, int repetition_limit, RegExpNode* on_success); + static ActionNode* ModifyFlags(RegExpFlags flags, RegExpNode* on_success); void Accept(NodeVisitor* visitor) override; void Emit(RegExpCompiler* compiler, Trace* trace) override; void GetQuickCheckDetails(QuickCheckDetails* details, @@ -353,6 +355,10 @@ class ActionNode : public SeqRegExpNode { int GreedyLoopTextLength() override { return kNodeIsTooComplexForGreedyLoops; } + RegExpFlags flags() { + DCHECK_EQ(action_type(), MODIFY_FLAGS); + return RegExpFlags{data_.u_modify_flags.flags}; + } private: union { @@ -382,9 +388,13 @@ class ActionNode : public SeqRegExpNode { int range_from; int range_to; } u_clear_captures; + struct { + int flags; + } u_modify_flags; } data_; ActionNode(ActionType action_type, RegExpNode* on_success) : SeqRegExpNode(on_success), action_type_(action_type) {} + ActionType action_type_; friend class DotPrinterImpl; friend Zone; @@ -499,12 +509,11 @@ class AssertionNode : public SeqRegExpNode { class BackReferenceNode : public SeqRegExpNode { public: - BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags, - bool read_backward, RegExpNode* on_success) + BackReferenceNode(int start_reg, int end_reg, bool read_backward, + RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg), - flags_(flags), read_backward_(read_backward) {} void Accept(NodeVisitor* visitor) override; int start_register() { return start_reg_; } @@ -522,7 +531,6 @@ class BackReferenceNode : public SeqRegExpNode { private: int start_reg_; int end_reg_; - RegExpFlags flags_; bool read_backward_; }; diff --git a/js/src/irregexp/imported/regexp-parser.cc b/js/src/irregexp/imported/regexp-parser.cc index ea2a6c6d7a..965fc567b7 100644 --- a/js/src/irregexp/imported/regexp-parser.cc +++ b/js/src/irregexp/imported/regexp-parser.cc @@ -13,7 +13,7 @@ #include "unicode/unistr.h" #include "unicode/usetiter.h" #include "unicode/utf16.h" // For U16_NEXT -#endif // V8_INTL_SUPPORT +#endif // V8_INTL_SUPPORT namespace v8 { namespace internal { @@ -67,8 +67,7 @@ class RegExpTextBuilder { bool ignore_case() const { return IsIgnoreCase(flags_); } bool IsUnicodeMode() const { // Either /v or /u enable UnicodeMode - // TODO(v8:11935): Change permalink once proposal is in stage 4. - // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern + // https://tc39.es/ecma262/#sec-parsepattern return IsUnicode(flags_) || IsUnicodeSets(flags_); } Zone* zone() const { return zone_; } @@ -264,7 +263,7 @@ RegExpTree* RegExpTextBuilder::PopLastAtom() { characters_ = nullptr; atom = zone()->New(char_vector); return atom; - } else if (text_.size() > 0) { + } else if (!text_.empty()) { atom = text_.back(); text_.pop_back(); return atom; @@ -315,8 +314,7 @@ class RegExpBuilder { void FlushTerms(); bool IsUnicodeMode() const { // Either /v or /u enable UnicodeMode - // TODO(v8:11935): Change permalink once proposal is in stage 4. - // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern + // https://tc39.es/ecma262/#sec-parsepattern return IsUnicode(flags_) || IsUnicodeSets(flags_); } Zone* zone() const { return zone_; } @@ -354,7 +352,12 @@ class RegExpParserState : public ZoneObject { group_type_(group_type), lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index), - capture_name_(capture_name) {} + capture_name_(capture_name) { + if (previous_state != nullptr) { + non_participating_capture_group_interval_ = + previous_state->non_participating_capture_group_interval(); + } + } // Parser state of containing expression, if any. RegExpParserState* previous_state() const { return previous_state_; } bool IsSubexpression() { return previous_state_ != nullptr; } @@ -371,6 +374,9 @@ class RegExpParserState : public ZoneObject { // The name of the current sub-expression, if group_type is CAPTURE. Only // used for named captures. const ZoneVector* capture_name() const { return capture_name_; } + std::pair non_participating_capture_group_interval() const { + return non_participating_capture_group_interval_; + } bool IsNamedCapture() const { return capture_name_ != nullptr; } @@ -398,6 +404,18 @@ class RegExpParserState : public ZoneObject { return false; } + void NewAlternative(int captures_started) { + if (non_participating_capture_group_interval().second != 0) { + // Extend the non-participating interval. + non_participating_capture_group_interval_.second = captures_started; + } else { + // Create new non-participating interval from the start of the current + // enclosing group to all captures created within that group so far. + non_participating_capture_group_interval_ = + std::make_pair(capture_index(), captures_started); + } + } + private: // Linked list implementation of stack of states. RegExpParserState* const previous_state_; @@ -411,6 +429,11 @@ class RegExpParserState : public ZoneObject { const int disjunction_capture_index_; // Stored capture name (if any). const ZoneVector* const capture_name_; + // Interval of (named) capture indices ]from, to] that are not participating + // in the current state (i.e. they cannot match). + // Capture indices are not participating if they were created in a different + // alternative. + std::pair non_participating_capture_group_interval_; }; template @@ -463,17 +486,22 @@ class RegExpParserImpl final { RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, ClassSetOperandType* type_out, ZoneList* ranges, - CharacterClassStrings* strings); + CharacterClassStrings* strings, + base::uc32* character); base::uc32 ParseClassSetCharacter(); // Parses and returns a single escaped character. base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state, bool* is_escaped_unicode_character); + void AddMaybeSimpleCaseFoldedRange(ZoneList* ranges, + CharacterRange new_range); + RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, ClassSetOperandType first_operand_type, ZoneList* ranges, - CharacterClassStrings* strings); + CharacterClassStrings* strings, + base::uc32 first_character); RegExpTree* ParseClassIntersection(const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, ClassSetOperandType first_operand_type); @@ -504,11 +532,10 @@ class RegExpParserImpl final { int captures_started() const { return captures_started_; } int position() const { return next_pos_ - 1; } bool failed() const { return failed_; } - RegExpFlags flags() const { return top_level_flags_; } + RegExpFlags flags() const { return flags_; } bool IsUnicodeMode() const { // Either /v or /u enable UnicodeMode - // TODO(v8:11935): Change permalink once proposal is in stage 4. - // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern + // https://tc39.es/ecma262/#sec-parsepattern return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_; } bool unicode_sets() const { return IsUnicodeSets(flags()); } @@ -528,7 +555,7 @@ class RegExpParserImpl final { // Creates a new named capture at the specified index. Must be called exactly // once for each named capture. Fails if a capture with the same name is // encountered. - bool CreateNamedCaptureAtIndex(const ZoneVector* name, int index); + bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index); // Parses the name of a capture group (?pattern). The name must adhere // to IdentifierName in the ECMAScript standard. @@ -543,7 +570,7 @@ class RegExpParserImpl final { // to avoid complicating cases in which references comes before the capture. void PatchNamedBackReferences(); - ZoneVector* GetNamedCaptures() const; + ZoneVector* GetNamedCaptures(); // Returns true iff the pattern contains named captures. May call // ScanForCaptures to look ahead at the remaining pattern. @@ -593,16 +620,20 @@ class RegExpParserImpl final { RegExpError error_ = RegExpError::kNone; int error_pos_ = 0; ZoneList* captures_; - ZoneSet* named_captures_; + // Maps capture names to a list of capture indices with this name. + ZoneMap*, RegExpCaptureNameLess>* + named_captures_; ZoneList* named_back_references_; + ZoneList* temp_ranges_; const CharT* const input_; const int input_length_; base::uc32 current_; - const RegExpFlags top_level_flags_; + RegExpFlags flags_; bool force_unicode_ = false; // Force parser to act as if unicode were set. int next_pos_; int captures_started_; int capture_count_; // Only valid after we have scanned for captures. + int lookaround_count_; // Only valid after we have scanned for lookbehinds. bool has_more_; bool simple_; bool contains_anchor_; @@ -625,10 +656,11 @@ RegExpParserImpl::RegExpParserImpl( input_(input), input_length_(input_length), current_(kEndMarker), - top_level_flags_(flags), + flags_(flags), next_pos_(0), captures_started_(0), capture_count_(0), + lookaround_count_(0), has_more_(true), simple_(false), contains_anchor_(false), @@ -909,21 +941,21 @@ RegExpTree* RegExpParserImpl::ParseDisjunction() { // Build result of subexpression. if (group_type == CAPTURE) { if (state->IsNamedCapture()) { - CreateNamedCaptureAtIndex(state->capture_name(), - capture_index CHECK_FAILED); + CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED); } RegExpCapture* capture = GetCapture(capture_index); capture->set_body(body); body = capture; } else if (group_type == GROUPING) { - body = zone()->template New(body); + body = zone()->template New(body, builder->flags()); } else { DCHECK(group_type == POSITIVE_LOOKAROUND || group_type == NEGATIVE_LOOKAROUND); bool is_positive = (group_type == POSITIVE_LOOKAROUND); body = zone()->template New( body, is_positive, end_capture_index - capture_index, - capture_index, state->lookaround_type()); + capture_index, state->lookaround_type(), lookaround_count_); + lookaround_count_++; } // Restore previous state. @@ -937,6 +969,7 @@ RegExpTree* RegExpParserImpl::ParseDisjunction() { } case '|': { Advance(); + state->NewAlternative(captures_started()); builder->NewAlternative(); continue; } @@ -984,6 +1017,7 @@ RegExpTree* RegExpParserImpl::ParseDisjunction() { case '(': { state = ParseOpenParenthesis(state CHECK_FAILED); builder = state->builder(); + flags_ = builder->flags(); continue; } case '[': { @@ -1037,8 +1071,8 @@ RegExpTree* RegExpParserImpl::ParseDisjunction() { builder->AddEmpty(); } else { RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = zone()->template New( - capture, builder->flags()); + RegExpTree* atom = + zone()->template New(capture, zone()); builder->AddAtom(atom); } break; @@ -1246,43 +1280,91 @@ RegExpParserState* RegExpParserImpl::ParseOpenParenthesis( bool is_named_capture = false; const ZoneVector* capture_name = nullptr; SubexpressionType subexpr_type = CAPTURE; + RegExpFlags flags = state->builder()->flags(); + bool parsing_modifiers = false; + bool modifiers_polarity = true; + RegExpFlags modifiers; Advance(); if (current() == '?') { - switch (Next()) { - case ':': - Advance(2); - subexpr_type = GROUPING; - break; - case '=': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = POSITIVE_LOOKAROUND; - break; - case '!': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - case '<': - Advance(); - if (Next() == '=') { + do { + switch (Next()) { + case '-': + if (!v8_flags.js_regexp_modifiers) { + ReportError(RegExpError::kInvalidGroup); + return nullptr; + } + Advance(); + parsing_modifiers = true; + if (modifiers_polarity == false) { + ReportError(RegExpError::kMultipleFlagDashes); + return nullptr; + } + modifiers_polarity = false; + break; + case 'm': + case 'i': + case 's': { + if (!v8_flags.js_regexp_modifiers) { + ReportError(RegExpError::kInvalidGroup); + return nullptr; + } + Advance(); + parsing_modifiers = true; + RegExpFlag flag = TryRegExpFlagFromChar(current()).value(); + if ((modifiers & flag) != 0) { + ReportError(RegExpError::kRepeatedFlag); + return nullptr; + } + modifiers |= flag; + flags.set(flag, modifiers_polarity); + break; + } + case ':': + Advance(2); + parsing_modifiers = false; + subexpr_type = GROUPING; + break; + case '=': Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; + parsing_modifiers = false; + lookaround_type = RegExpLookaround::LOOKAHEAD; subexpr_type = POSITIVE_LOOKAROUND; break; - } else if (Next() == '!') { + case '!': Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; + parsing_modifiers = false; + lookaround_type = RegExpLookaround::LOOKAHEAD; subexpr_type = NEGATIVE_LOOKAROUND; break; - } - is_named_capture = true; - has_named_captures_ = true; - Advance(); - break; - default: - ReportError(RegExpError::kInvalidGroup); - return nullptr; + case '<': + Advance(); + parsing_modifiers = false; + if (Next() == '=') { + Advance(2); + lookaround_type = RegExpLookaround::LOOKBEHIND; + subexpr_type = POSITIVE_LOOKAROUND; + break; + } else if (Next() == '!') { + Advance(2); + lookaround_type = RegExpLookaround::LOOKBEHIND; + subexpr_type = NEGATIVE_LOOKAROUND; + break; + } + is_named_capture = true; + has_named_captures_ = true; + Advance(); + break; + default: + ReportError(RegExpError::kInvalidGroup); + return nullptr; + } + } while (parsing_modifiers); + } + if (modifiers_polarity == false) { + // We encountered a dash. + if (modifiers == 0) { + ReportError(RegExpError::kInvalidFlagGroup); + return nullptr; } } if (subexpr_type == CAPTURE) { @@ -1299,7 +1381,7 @@ RegExpParserState* RegExpParserImpl::ParseOpenParenthesis( // Store current state and begin new disjunction parsing. return zone()->template New( state, subexpr_type, lookaround_type, captures_started_, capture_name, - state->builder()->flags(), zone()); + flags, zone()); } // In order to know whether an escape is a backreference or not we have to scan @@ -1511,7 +1593,10 @@ const ZoneVector* RegExpParserImpl::ParseCaptureGroupName() { template bool RegExpParserImpl::CreateNamedCaptureAtIndex( - const ZoneVector* name, int index) { + const RegExpParserState* state, int index) { + const ZoneVector* name = state->capture_name(); + const std::pair non_participating_capture_group_interval = + state->non_participating_capture_group_interval(); DCHECK(0 < index && index <= captures_started_); DCHECK_NOT_NULL(name); @@ -1521,21 +1606,33 @@ bool RegExpParserImpl::CreateNamedCaptureAtIndex( capture->set_name(name); if (named_captures_ == nullptr) { - named_captures_ = - zone_->template New>( - zone()); + named_captures_ = zone_->template New< + ZoneMap*, RegExpCaptureNameLess>>(zone()); } else { // Check for duplicates and bail if we find any. - const auto& named_capture_it = named_captures_->find(capture); if (named_capture_it != named_captures_->end()) { - ReportError(RegExpError::kDuplicateCaptureGroupName); - return false; + if (v8_flags.js_regexp_duplicate_named_groups) { + ZoneList* named_capture_indices = named_capture_it->second; + DCHECK_NOT_NULL(named_capture_indices); + DCHECK(!named_capture_indices->is_empty()); + for (int named_index : *named_capture_indices) { + if (named_index < non_participating_capture_group_interval.first || + named_index > non_participating_capture_group_interval.second) { + ReportError(RegExpError::kDuplicateCaptureGroupName); + return false; + } + } + } else { + ReportError(RegExpError::kDuplicateCaptureGroupName); + return false; + } } } - named_captures_->emplace(capture); - + auto entry = named_captures_->try_emplace( + capture, zone()->template New>(1, zone())); + entry.first->second->Add(index, zone()); return true; } @@ -1558,7 +1655,7 @@ bool RegExpParserImpl::ParseNamedBackReference( builder->AddEmpty(); } else { RegExpBackReference* atom = - zone()->template New(builder->flags()); + zone()->template New(zone()); atom->set_name(name); builder->AddAtom(atom); @@ -1595,16 +1692,17 @@ void RegExpParserImpl::PatchNamedBackReferences() { DCHECK_NULL(search_capture->name()); search_capture->set_name(ref->name()); - int index = -1; const auto& capture_it = named_captures_->find(search_capture); - if (capture_it != named_captures_->end()) { - index = (*capture_it)->index(); - } else { + if (capture_it == named_captures_->end()) { ReportError(RegExpError::kInvalidNamedCaptureReference); return; } - ref->set_capture(GetCapture(index)); + DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups, + capture_it->second->length() == 1); + for (int index : *capture_it->second) { + ref->add_capture(GetCapture(index), zone()); + } } } @@ -1627,13 +1725,22 @@ RegExpCapture* RegExpParserImpl::GetCapture(int index) { } template -ZoneVector* RegExpParserImpl::GetNamedCaptures() const { - if (named_captures_ == nullptr || named_captures_->empty()) { +ZoneVector* RegExpParserImpl::GetNamedCaptures() { + if (named_captures_ == nullptr) { return nullptr; } + DCHECK(!named_captures_->empty()); - return zone()->template New>( - named_captures_->begin(), named_captures_->end(), zone()); + ZoneVector* flattened_named_captures = + zone()->template New>(zone()); + for (auto capture : *named_captures_) { + DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups, + capture.second->length() == 1); + for (int index : *capture.second) { + flattened_named_captures->push_back(GetCapture(index)); + } + } + return flattened_named_captures; } template @@ -1890,7 +1997,7 @@ bool LookupPropertyValueName(UProperty property, ExtractStringsFromUnicodeSet(set, result_strings, flags, zone); } const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags); - if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set); + if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE); set.removeAllStrings(); if (negate) set.complement(); for (int i = 0; i < set.getRangeCount(); i++) { @@ -2096,13 +2203,22 @@ bool RegExpParserImpl::AddPropertyClassRange( if (!IsSupportedBinaryProperty(property, unicode_sets())) return false; if (!IsExactPropertyAlias(name, property)) return false; // Negation of properties with strings is not allowed. - // TODO(v8:11935): Change permalink once proposal is in stage 4. // See - // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-static-semantics-maycontainstrings + // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings if (negate && IsBinaryPropertyOfStrings(property)) return false; - return LookupPropertyValueName(property, negate ? "N" : "Y", false, - add_to_ranges, add_to_strings, flags(), - zone()); + if (unicode_sets()) { + // In /v mode we can't simple lookup the "false" binary property values, + // as the spec requires us to perform case folding before calculating the + // complement. + // See https://tc39.es/ecma262/#sec-compiletocharset + // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue + return LookupPropertyValueName(property, "Y", negate, add_to_ranges, + add_to_strings, flags(), zone()); + } else { + return LookupPropertyValueName(property, negate ? "N" : "Y", false, + add_to_ranges, add_to_strings, flags(), + zone()); + } } else { // Both property name and value name are specified. Attempt to interpret // the property name as enumerated property. @@ -2325,8 +2441,7 @@ base::uc32 RegExpParserImpl::ParseCharacterEscape( return c; } -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassRanges +// https://tc39.es/ecma262/#prod-ClassRanges template RegExpTree* RegExpParserImpl::ParseClassRanges( ZoneList* ranges, bool add_unicode_case_equivalents) { @@ -2475,8 +2590,7 @@ void AddClassString(ZoneList* normalized_string, } // namespace -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassStringDisjunction +// https://tc39.es/ecma262/#prod-ClassStringDisjunction template RegExpTree* RegExpParserImpl::ParseClassStringDisjunction( ZoneList* ranges, CharacterClassStrings* strings) { @@ -2526,8 +2640,7 @@ RegExpTree* RegExpParserImpl::ParseClassStringDisjunction( return nullptr; } -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand +// https://tc39.es/ecma262/#prod-ClassSetOperand // Tree returned based on type_out: // * kNestedClass: RegExpClassSetExpression // * For all other types: RegExpClassSetOperand @@ -2538,12 +2651,13 @@ RegExpTree* RegExpParserImpl::ParseClassSetOperand( zone()->template New>(1, zone()); CharacterClassStrings* strings = zone()->template New(zone()); - RegExpTree* tree = - ParseClassSetOperand(builder, type_out, ranges, strings CHECK_FAILED); + base::uc32 character; + RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings, + &character CHECK_FAILED); DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass, tree == nullptr); DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter, - ranges->length() == 1); + ranges->is_empty()); DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter, strings->empty()); DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass, @@ -2558,21 +2672,27 @@ RegExpTree* RegExpParserImpl::ParseClassSetOperand( // CharacterClassEscape includes \p{}, which can contain ranges, strings or // both and \P{}, which could contain nothing (i.e. \P{Any}). if (tree == nullptr) { + if (*type_out == ClassSetOperandType::kClassSetCharacter) { + AddMaybeSimpleCaseFoldedRange(ranges, + CharacterRange::Singleton(character)); + } tree = zone()->template New(ranges, strings); } return tree; } -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand -// Based on |type_out| either a tree is returned or ranges/strings modified. -// If a tree is returned, ranges/strings are not modified. -// If |type_out| is kNestedClass, a tree of type RegExpClassSetExpression is -// returned. For all other types, ranges is modified and nullptr is returned. +// https://tc39.es/ecma262/#prod-ClassSetOperand +// Based on |type_out| either a tree is returned or +// |ranges|/|strings|/|character| modified. If a tree is returned, +// ranges/strings are not modified. If |type_out| is kNestedClass, a tree of +// type RegExpClassSetExpression is returned. If | type_out| is +// kClassSetCharacter, |character| is set and nullptr returned. For all other +// types, |ranges|/|strings|/|character| is modified and nullptr is returned. template RegExpTree* RegExpParserImpl::ParseClassSetOperand( const RegExpBuilder* builder, ClassSetOperandType* type_out, - ZoneList* ranges, CharacterClassStrings* strings) { + ZoneList* ranges, CharacterClassStrings* strings, + base::uc32* character) { DCHECK(unicode_sets()); base::uc32 c = current(); if (c == '\\') { @@ -2599,7 +2719,7 @@ RegExpTree* RegExpParserImpl::ParseClassSetOperand( *type_out = ClassSetOperandType::kClassSetCharacter; c = ParseClassSetCharacter(CHECK_FAILED); - ranges->Add(CharacterRange::Singleton(c), zone()); + *character = c; return nullptr; } @@ -2653,13 +2773,28 @@ bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) { } // namespace -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassUnion +template +void RegExpParserImpl::AddMaybeSimpleCaseFoldedRange( + ZoneList* ranges, CharacterRange new_range) { + DCHECK(unicode_sets()); + if (ignore_case()) { + ZoneList* new_ranges = + zone()->template New>(2, zone()); + new_ranges->Add(new_range, zone()); + CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone()); + ranges->AddAll(*new_ranges, zone()); + } else { + ranges->Add(new_range, zone()); + } + CharacterRange::Canonicalize(ranges); +} + +// https://tc39.es/ecma262/#prod-ClassUnion template RegExpTree* RegExpParserImpl::ParseClassUnion( const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, ClassSetOperandType first_operand_type, ZoneList* ranges, - CharacterClassStrings* strings) { + CharacterClassStrings* strings, base::uc32 character) { DCHECK(unicode_sets()); ZoneList* operands = zone()->template New>(2, zone()); @@ -2673,7 +2808,6 @@ RegExpTree* RegExpParserImpl::ParseClassUnion( operands->Add(first_operand, zone()); } ClassSetOperandType last_type = first_operand_type; - const bool needs_case_folding = ignore_case(); while (has_more() && current() != ']') { if (current() == '-') { // Mix of ClassSetRange and ClassSubtraction is not allowed. @@ -2690,42 +2824,36 @@ RegExpTree* RegExpParserImpl::ParseClassUnion( // represent a character range. // In case one of them is not a ClassSetCharacter, it is a syntax error, // as '-' can not be used unescaped within a class with /v. - // TODO(v8:11935): Change permalink once proposal is in stage 4. // See - // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetRange + // https://tc39.es/ecma262/#prod-ClassSetRange if (last_type != ClassSetOperandType::kClassSetCharacter) { return ReportError(RegExpError::kInvalidCharacterClass); } - ParseClassSetOperand(builder, &last_type, ranges, strings CHECK_FAILED); + base::uc32 from = character; + ParseClassSetOperand(builder, &last_type, ranges, strings, + &character CHECK_FAILED); if (last_type != ClassSetOperandType::kClassSetCharacter) { return ReportError(RegExpError::kInvalidCharacterClass); } - // Remove the last two singleton characters added to ranges, and combine - // them into a range. - auto rhs_ranges = ranges->RemoveLast(); - auto lhs_ranges = ranges->RemoveLast(); - DCHECK(lhs_ranges.IsSingleton()); - DCHECK(rhs_ranges.IsSingleton()); - base::uc32 from = lhs_ranges.from(); - base::uc32 to = rhs_ranges.from(); - if (from > to) { + if (from > character) { return ReportError(RegExpError::kOutOfOrderCharacterClass); } - ranges->Add(CharacterRange::Range(from, to), zone()); + AddMaybeSimpleCaseFoldedRange(ranges, + CharacterRange::Range(from, character)); last_type = ClassSetOperandType::kClassSetRange; } else { DCHECK_NE(current(), '-'); - RegExpTree* operand = ParseClassSetOperand(builder, &last_type, ranges, - strings CHECK_FAILED); + if (last_type == ClassSetOperandType::kClassSetCharacter) { + AddMaybeSimpleCaseFoldedRange(ranges, + CharacterRange::Singleton(character)); + } + RegExpTree* operand = ParseClassSetOperand( + builder, &last_type, ranges, strings, &character CHECK_FAILED); if (operand != nullptr) { may_contain_strings |= MayContainStrings(last_type, operand); // Add the range we started building as operand and reset the current // range. if (!ranges->is_empty() || !strings->empty()) { - if (needs_case_folding) { - CharacterRange::Canonicalize(ranges); - CharacterRange::AddUnicodeCaseEquivalents(ranges, zone()); - } may_contain_strings |= !strings->empty(); operands->Add( zone()->template New(ranges, strings), @@ -2742,12 +2870,12 @@ RegExpTree* RegExpParserImpl::ParseClassUnion( return ReportError(RegExpError::kUnterminatedCharacterClass); } + if (last_type == ClassSetOperandType::kClassSetCharacter) { + AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character)); + } + // Add the range we started building as operand. if (!ranges->is_empty() || !strings->empty()) { - if (needs_case_folding) { - CharacterRange::Canonicalize(ranges); - CharacterRange::AddUnicodeCaseEquivalents(ranges, zone()); - } may_contain_strings |= !strings->empty(); operands->Add(zone()->template New(ranges, strings), zone()); @@ -2773,8 +2901,7 @@ RegExpTree* RegExpParserImpl::ParseClassUnion( may_contain_strings, operands); } -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassIntersection +// https://tc39.es/ecma262/#prod-ClassIntersection template RegExpTree* RegExpParserImpl::ParseClassIntersection( const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, @@ -2815,8 +2942,7 @@ RegExpTree* RegExpParserImpl::ParseClassIntersection( may_contain_strings, operands); } -// TODO(v8:11935): Change permalink once proposal is in stage 4. -// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSubtraction +// https://tc39.es/ecma262/#prod-ClassSubtraction template RegExpTree* RegExpParserImpl::ParseClassSubtraction( const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, @@ -2891,12 +3017,16 @@ RegExpTree* RegExpParserImpl::ParseCharacterClass( ClassSetOperandType operand_type; CharacterClassStrings* strings = zone()->template New(zone()); - RegExpTree* operand = ParseClassSetOperand(builder, &operand_type, ranges, - strings CHECK_FAILED); + base::uc32 character; + RegExpTree* operand = ParseClassSetOperand( + builder, &operand_type, ranges, strings, &character CHECK_FAILED); switch (current()) { case '-': if (Next() == '-') { if (operand == nullptr) { + if (operand_type == ClassSetOperandType::kClassSetCharacter) { + ranges->Add(CharacterRange::Singleton(character), zone()); + } operand = zone()->template New(ranges, strings); } @@ -2908,6 +3038,9 @@ RegExpTree* RegExpParserImpl::ParseCharacterClass( case '&': if (Next() == '&') { if (operand == nullptr) { + if (operand_type == ClassSetOperandType::kClassSetCharacter) { + ranges->Add(CharacterRange::Singleton(character), zone()); + } operand = zone()->template New(ranges, strings); } @@ -2916,7 +3049,7 @@ RegExpTree* RegExpParserImpl::ParseCharacterClass( } } return ParseClassUnion(builder, is_negated, operand, operand_type, ranges, - strings); + strings, character); } } @@ -3047,7 +3180,7 @@ bool RegExpBuilder::AddQuantifierToAtom( RegExpTree* atom = text_builder().PopLastAtom(); if (atom != nullptr) { FlushText(); - } else if (terms_.size() > 0) { + } else if (!terms_.empty()) { atom = terms_.back(); terms_.pop_back(); if (atom->IsLookaround()) { diff --git a/js/src/irregexp/imported/regexp.h b/js/src/irregexp/imported/regexp.h index 50269a4b71..5dc9070ed9 100644 --- a/js/src/irregexp/imported/regexp.h +++ b/js/src/irregexp/imported/regexp.h @@ -87,8 +87,8 @@ class RegExp final : public AllStatic { RegExpFlags flags, uint32_t backtrack_limit); // Ensures that a regexp is fully compiled and ready to be executed on a - // subject string. Returns true on success. Return false on failure, and - // then an exception will be pending. + // subject string. Returns true on success. Throw and return false on + // failure. V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate, Handle re, Handle subject); @@ -211,14 +211,16 @@ class RegExpResultsCache final : public AllStatic { // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi. // On success, the returned result is guaranteed to be a COW-array. - static Object Lookup(Heap* heap, String key_string, Object key_pattern, - FixedArray* last_match_out, ResultsCacheType type); + static Tagged Lookup(Heap* heap, Tagged key_string, + Tagged key_pattern, + Tagged* last_match_out, + ResultsCacheType type); // Attempt to add value_array to the cache specified by type. On success, // value_array is turned into a COW-array. static void Enter(Isolate* isolate, Handle key_string, Handle key_pattern, Handle value_array, Handle last_match_cache, ResultsCacheType type); - static void Clear(FixedArray cache); + static void Clear(Tagged cache); static constexpr int kRegExpResultsCacheSize = 0x100; diff --git a/js/src/irregexp/imported/special-case.cc b/js/src/irregexp/imported/special-case.cc index f5a9928b3a..d40ada6bb9 100644 --- a/js/src/irregexp/imported/special-case.cc +++ b/js/src/irregexp/imported/special-case.cc @@ -82,29 +82,6 @@ const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() { return set.Pointer()->set; } -icu::UnicodeSet BuildUnicodeNonSimpleCloseOverSet() { - icu::UnicodeSet set; - set.add(0x390); - set.add(0x3b0); - set.add(0x1fd3); - set.add(0x1fe3); - set.add(0xfb05, 0xfb06); - set.freeze(); - return set; -} - -struct UnicodeNonSimpleCloseOverSetData { - UnicodeNonSimpleCloseOverSetData() : set(BuildUnicodeNonSimpleCloseOverSet()) {} - const icu::UnicodeSet set; -}; - -//static -const icu::UnicodeSet& RegExpCaseFolding::UnicodeNonSimpleCloseOverSet() { - static base::LazyInstance::type set = - LAZY_INSTANCE_INITIALIZER; - return set.Pointer()->set; -} - } // namespace internal } // namespace v8 diff --git a/js/src/irregexp/imported/special-case.h b/js/src/irregexp/imported/special-case.h index ea511af5a4..050d72a064 100644 --- a/js/src/irregexp/imported/special-case.h +++ b/js/src/irregexp/imported/special-case.h @@ -70,21 +70,11 @@ namespace internal { // another character. Characters that match no other characters in // their equivalence class are added to IgnoreSet. Characters that // match at least one other character are added to SpecialAddSet. -// -// For unicode ignoreCase ("iu" and "iv"), -// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in -// the same equivalence class. This includes characaters that are in the same -// equivalence class using full case folding. According to the spec, only -// simple case folding shall be considered. We therefore create -// UnicodeNonSimpleCloseOverSet containing all characters for which -// UnicodeSet::closeOver adds characters that are not simple case folds. This -// set should be used similar to IgnoreSet described above. class RegExpCaseFolding final : public AllStatic { public: static const icu::UnicodeSet& IgnoreSet(); static const icu::UnicodeSet& SpecialAddSet(); - static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet(); // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: // Canonicalize) step 3, which is used to determine whether -- cgit v1.2.3