summary | refs | log | tree | commit | diff | stats
path: root/js/src/irregexp
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/irregexp')
-rw-r--r-- js/src/irregexp/RegExpAPI.cpp 4
-rw-r--r-- js/src/irregexp/RegExpNativeMacroAssembler.cpp 4
-rw-r--r-- js/src/irregexp/RegExpShim.cpp 6
-rw-r--r-- js/src/irregexp/RegExpShim.h 82
-rw-r--r-- js/src/irregexp/RegExpTypes.h 18
-rw-r--r-- js/src/irregexp/imported/gen-regexp-special-case.cc 48
-rw-r--r-- js/src/irregexp/imported/regexp-ast.cc 23
-rw-r--r-- js/src/irregexp/imported/regexp-ast.h 42
-rw-r--r-- js/src/irregexp/imported/regexp-bytecode-generator.cc 2
-rw-r--r-- js/src/irregexp/imported/regexp-bytecode-peephole.cc 4
-rw-r--r-- js/src/irregexp/imported/regexp-compiler-tonode.cc 99
-rw-r--r-- js/src/irregexp/imported/regexp-compiler.cc 43
-rw-r--r-- js/src/irregexp/imported/regexp-compiler.h 8
-rw-r--r-- js/src/irregexp/imported/regexp-dotprinter.cc 4
-rw-r--r-- js/src/irregexp/imported/regexp-interpreter.cc 84
-rw-r--r-- js/src/irregexp/imported/regexp-interpreter.h 13
-rw-r--r-- js/src/irregexp/imported/regexp-macro-assembler.cc 70
-rw-r--r-- js/src/irregexp/imported/regexp-macro-assembler.h 23
-rw-r--r-- js/src/irregexp/imported/regexp-nodes.h 18
-rw-r--r-- js/src/irregexp/imported/regexp-parser.cc 393
-rw-r--r-- js/src/irregexp/imported/regexp.h 12
-rw-r--r-- js/src/irregexp/imported/special-case.cc 23
-rw-r--r-- js/src/irregexp/imported/special-case.h 10
-rw-r--r-- js/src/irregexp/moz.build 8
-rw-r--r-- js/src/irregexp/moz.yaml 4
25 files changed, 618 insertions, 427 deletions
diff --git a/js/src/irregexp/RegExpAPI.cpp b/js/src/irregexp/RegExpAPI.cpp
index f1ba1fbc4b..39a6f8ccc9 100644
--- a/js/src/irregexp/RegExpAPI.cpp
+++ b/js/src/irregexp/RegExpAPI.cpp
@@ -632,7 +632,7 @@ enum class AssembleResult {
// RegExpShared.
ByteArray bytecode =
v8::internal::ByteArray::cast(*result.code).takeOwnership(cx->isolate);
- uint32_t length = bytecode->length;
+ uint32_t length = bytecode->length();
re->setByteCode(bytecode.release(), isLatin1);
js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode);
}
@@ -773,7 +773,7 @@ bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
bool isLatin1 = input->hasLatin1Chars();
SampleCharacters(input, compiler);
- data.node = compiler.PreprocessRegExp(&data, flags, isLatin1);
+ data.node = compiler.PreprocessRegExp(&data, isLatin1);
data.error = AnalyzeRegExp(cx->isolate, isLatin1, flags, data.node);
if (data.error != RegExpError::kNone) {
MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow);
diff --git a/js/src/irregexp/RegExpNativeMacroAssembler.cpp b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
index 2a8b1749c2..99cfc31bfc 100644
--- a/js/src/irregexp/RegExpNativeMacroAssembler.cpp
+++ b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
@@ -247,8 +247,8 @@ void SMRegExpMacroAssembler::CheckCharacterNotInRange(base::uc16 from,
bool SMRegExpMacroAssembler::IsCharacterInRangeArray(uint32_t c,
ByteArrayData* ranges) {
js::AutoUnsafeCallWithABI unsafe;
- MOZ_ASSERT(ranges->length % sizeof(uint16_t) == 0);
- uint32_t length = ranges->length / sizeof(uint16_t);
+ MOZ_ASSERT(ranges->length() % sizeof(uint16_t) == 0);
+ uint32_t length = ranges->length() / sizeof(uint16_t);
MOZ_ASSERT(length > 0);
// Fast paths.
diff --git a/js/src/irregexp/RegExpShim.cpp b/js/src/irregexp/RegExpShim.cpp
index 2b2c3cd4a0..da388e0057 100644
--- a/js/src/irregexp/RegExpShim.cpp
+++ b/js/src/irregexp/RegExpShim.cpp
@@ -227,13 +227,13 @@ Handle<ByteArray> Isolate::NewByteArray(int length, AllocationType alloc) {
js::AutoEnterOOMUnsafeRegion oomUnsafe;
- size_t alloc_size = sizeof(uint32_t) + length;
+ size_t alloc_size = sizeof(ByteArrayData) + length;
ByteArrayData* data =
static_cast<ByteArrayData*>(allocatePseudoHandle(alloc_size));
if (!data) {
oomUnsafe.crash("Irregexp NewByteArray");
}
- data->length = length;
+ new (data) ByteArrayData(length);
return Handle<ByteArray>(JS::PrivateValue(data), this);
}
@@ -261,7 +261,7 @@ Handle<FixedIntegerArray<T>> Isolate::NewFixedIntegerArray(uint32_t length) {
if (!data) {
oomUnsafe.crash("Irregexp NewFixedIntegerArray");
}
- data->length = rawLength;
+ new (data) ByteArrayData(rawLength);
return Handle<FixedIntegerArray<T>>(JS::PrivateValue(data), this);
}
diff --git a/js/src/irregexp/RegExpShim.h b/js/src/irregexp/RegExpShim.h
index 3f85413421..4d32c84920 100644
--- a/js/src/irregexp/RegExpShim.h
+++ b/js/src/irregexp/RegExpShim.h
@@ -586,15 +586,6 @@ class Object {
// IsCharacterInRangeArray in regexp-macro-assembler.cc.
Object(uintptr_t raw) : asBits_(raw) { MOZ_CRASH("unused"); }
- // Used in regexp-interpreter.cc to check the return value of
- // isolate->stack_guard()->HandleInterrupts(). We want to handle
- // interrupts in the caller, so we always return false from
- // HandleInterrupts and true here.
- inline bool IsException(Isolate*) const {
- MOZ_ASSERT(!value().toBoolean());
- return true;
- }
-
JS::Value value() const { return JS::Value::fromRawBits(asBits_); }
inline static Object cast(Object object) { return object; }
@@ -604,6 +595,14 @@ class Object {
uint64_t asBits_;
} JS_HAZ_GC_POINTER;
+// Used in regexp-interpreter.cc to check the return value of
+// isolate->stack_guard()->HandleInterrupts(). We want to handle
+// interrupts in the caller, so we return a magic value from
+// HandleInterrupts and check for it here.
+inline bool IsException(Object obj, Isolate*) {
+ return obj.value().isMagic(JS_INTERRUPT_REGEXP);
+}
+
class Smi : public Object {
public:
static Smi FromInt(int32_t value) {
@@ -626,6 +625,27 @@ class HeapObject : public Object {
}
};
+// V8's values use low-bit tagging. If the LSB is 0, it's a small
+// integer. If the LSB is 1, it's a pointer to some GC thing. In V8,
+// this wrapper class is used to represent a pointer that has the low
+// bit set, or a small integer that has been shifted left by one
+// bit. We don't use the same tagging system, so all we need is a
+// transparent wrapper that automatically converts to/from the wrapped
+// type.
+template <typename T>
+class Tagged {
+ public:
+ Tagged() {}
+ MOZ_IMPLICIT Tagged(const T& value) : value_(value) {}
+ MOZ_IMPLICIT Tagged(T&& value) : value_(std::move(value)) {}
+
+ T* operator->() { return &value_; }
+ constexpr operator T() const { return value_; }
+
+ private:
+ T value_;
+};
+
// A fixed-size array with Objects (aka Values) as element types.
// Implemented using the dense elements of an ArrayObject.
// Used for named captures.
@@ -668,13 +688,13 @@ T* ByteArrayData::typedData() {
template <typename T>
T ByteArrayData::getTyped(uint32_t index) {
- MOZ_ASSERT(index < length / sizeof(T));
+ MOZ_ASSERT(index < length() / sizeof(T));
return typedData<T>()[index];
}
template <typename T>
void ByteArrayData::setTyped(uint32_t index, T value) {
- MOZ_ASSERT(index < length / sizeof(T));
+ MOZ_ASSERT(index < length() / sizeof(T));
typedData<T>()[index] = value;
}
@@ -684,6 +704,7 @@ class ByteArray : public HeapObject {
ByteArrayData* inner() const {
return static_cast<ByteArrayData*>(value().toPrivate());
}
+ friend bool IsByteArray(Object obj);
public:
PseudoHandle<ByteArrayData> takeOwnership(Isolate* isolate);
@@ -692,8 +713,8 @@ class ByteArray : public HeapObject {
uint8_t get(uint32_t index) { return inner()->get(index); }
void set(uint32_t index, uint8_t val) { inner()->set(index, val); }
- uint32_t length() const { return inner()->length; }
- uint8_t* GetDataStartAddress() { return inner()->data(); }
+ uint32_t length() const { return inner()->length(); }
+ uint8_t* begin() { return inner()->data(); }
static ByteArray cast(Object object) {
ByteArray b;
@@ -701,11 +722,17 @@ class ByteArray : public HeapObject {
return b;
}
- bool IsByteArray() const { return true; }
-
friend class SMRegExpMacroAssembler;
};
+// This is only used in assertions. In debug builds, we put a magic value
+// in the header of each ByteArrayData, and assert here that it matches.
+inline bool IsByteArray(Object obj) {
+ MOZ_ASSERT(ByteArray::cast(obj).inner()->magic() ==
+ ByteArrayData::ExpectedMagic);
+ return true;
+}
+
// This is a convenience class used in V8 for treating a ByteArray as an array
// of fixed-size integers. This version supports integral types up to 32 bits.
template <typename T>
@@ -1030,6 +1057,7 @@ class JSRegExp : public HeapObject {
};
using RegExpFlags = JS::RegExpFlags;
+using RegExpFlag = JS::RegExpFlags::Flag;
inline bool IsUnicode(RegExpFlags flags) { return flags.unicode(); }
inline bool IsGlobal(RegExpFlags flags) { return flags.global(); }
@@ -1042,6 +1070,22 @@ inline bool IsEitherUnicode(RegExpFlags flags) {
return flags.unicode() || flags.unicodeSets();
}
+inline base::Optional<RegExpFlag> TryRegExpFlagFromChar(char c) {
+ RegExpFlag flag;
+
+ // The parser only calls this after verifying that it's a supported flag.
+ MOZ_ALWAYS_TRUE(JS::MaybeParseRegExpFlag(c, &flag));
+
+ return base::Optional(flag);
+}
+
+inline bool operator==(const RegExpFlags& lhs, const int& rhs) {
+ return lhs.value() == rhs;
+}
+inline bool operator!=(const RegExpFlags& lhs, const int& rhs) {
+ return !(lhs == rhs);
+}
+
class Histogram {
public:
inline void AddSample(int sample) {}
@@ -1126,9 +1170,11 @@ class Isolate {
// This is called from inside no-GC code. V8 runs the interrupt
// inside the no-GC code and then "manually relocates unhandlified
- // references" afterwards. We just return false and let the caller
- // handle interrupts.
- Object HandleInterrupts() { return Object(JS::BooleanValue(false)); }
+ // references" afterwards. We just return a magic value and let the
+ // caller handle interrupts.
+ Object HandleInterrupts() {
+ return Object(JS::MagicValue(JS_INTERRUPT_REGEXP));
+ }
JSContext* cx() const { return cx_; }
diff --git a/js/src/irregexp/RegExpTypes.h b/js/src/irregexp/RegExpTypes.h
index e2a619689c..620fac4ed5 100644
--- a/js/src/irregexp/RegExpTypes.h
+++ b/js/src/irregexp/RegExpTypes.h
@@ -21,15 +21,17 @@ namespace internal {
class ByteArrayData {
public:
- uint32_t length;
+ ByteArrayData(uint32_t length) : length_(length) {}
+
+ uint32_t length() { return length_; };
uint8_t* data();
uint8_t get(uint32_t index) {
- MOZ_ASSERT(index < length);
+ MOZ_ASSERT(index < length());
return data()[index];
}
void set(uint32_t index, uint8_t val) {
- MOZ_ASSERT(index < length);
+ MOZ_ASSERT(index < length());
data()[index] = val;
}
@@ -39,9 +41,19 @@ class ByteArrayData {
template <typename T>
void setTyped(uint32_t index, T value);
+#ifdef DEBUG
+ const static uint32_t ExpectedMagic = 0x12344321;
+ uint32_t magic() const { return magic_; }
+
+ private:
+ uint32_t magic_ = ExpectedMagic;
+#endif
+
private:
template <typename T>
T* typedData();
+
+ uint32_t length_;
};
class Isolate;
diff --git a/js/src/irregexp/imported/gen-regexp-special-case.cc b/js/src/irregexp/imported/gen-regexp-special-case.cc
index 8f6557ed30..0875568250 100644
--- a/js/src/irregexp/imported/gen-regexp-special-case.cc
+++ b/js/src/irregexp/imported/gen-regexp-special-case.cc
@@ -8,7 +8,6 @@
#include <sstream>
#include "irregexp/imported/special-case.h"
-#include "unicode/usetiter.h"
namespace v8 {
namespace internal {
@@ -126,52 +125,6 @@ void PrintSpecial(std::ofstream& out) {
PrintSet(out, "SpecialAddSet", special_add);
}
-void PrintUnicodeSpecial(std::ofstream& out) {
- icu::UnicodeSet non_simple_folding;
- icu::UnicodeSet current;
- UErrorCode status = U_ZERO_ERROR;
- // Look at all characters except white spaces.
- icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status);
- CHECK_EQ(status, U_ZERO_ERROR);
- icu::UnicodeSetIterator iter(interestingCP);
- while (iter.next()) {
- UChar32 c = iter.getCodepoint();
- current.set(c, c);
- current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings();
- CHECK(!current.isBogus());
- // Remove characters from the closeover that have a simple case folding.
- icu::UnicodeSet toRemove;
- icu::UnicodeSetIterator closeOverIter(current);
- while (closeOverIter.next()) {
- UChar32 closeOverChar = closeOverIter.getCodepoint();
- UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT);
- if (closeOverChar != closeOverSCF) {
- toRemove.add(closeOverChar);
- }
- }
- CHECK(!toRemove.isBogus());
- current.removeAll(toRemove);
-
- // The current character and its simple case folding are also always OK.
- UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT);
- current.remove(c);
- current.remove(scf);
-
- // If there are any characters remaining, they were added due to full case
- // foldings and shouldn't match the current charcter according to the spec.
- if (!current.isEmpty()) {
- // Ensure that the character doesn't have a simple case folding.
- // Otherwise the current approach of simply removing the character from
- // the set before calling closeOver won't work.
- CHECK_EQ(c, scf);
- non_simple_folding.add(c);
- }
- }
- CHECK(!non_simple_folding.isBogus());
-
- PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding);
-}
-
void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
@@ -192,7 +145,6 @@ void WriteHeader(const char* header_filename) {
<< "namespace internal {\n\n";
PrintSpecial(out);
- PrintUnicodeSpecial(out);
out << "\n"
<< "} // namespace internal\n"
diff --git a/js/src/irregexp/imported/regexp-ast.cc b/js/src/irregexp/imported/regexp-ast.cc
index 63eeb5c05d..34946bd80c 100644
--- a/js/src/irregexp/imported/regexp-ast.cc
+++ b/js/src/irregexp/imported/regexp-ast.cc
@@ -307,7 +307,7 @@ void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
}
void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) {
- os_ << "(?: ";
+ os_ << "(?" << that->flags() << ": ";
that->body()->Accept(this, data);
os_ << ")";
return nullptr;
@@ -325,7 +325,11 @@ void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
void* data) {
- os_ << "(<- " << that->index() << ")";
+ os_ << "(<- " << that->captures()->first()->index();
+ for (int i = 1; i < that->captures()->length(); ++i) {
+ os_ << "," << that->captures()->at(i)->index();
+ }
+ os_ << ")";
return nullptr;
}
@@ -406,10 +410,17 @@ RegExpClassSetExpression::RegExpClassSetExpression(
may_contain_strings_(may_contain_strings),
operands_(operands) {
DCHECK_NOT_NULL(operands);
- DCHECK_IMPLIES(is_negated_, !may_contain_strings_);
- max_match_ = 0;
- for (auto op : *operands) {
- max_match_ = std::max(max_match_, op->max_match());
+ if (is_negated) {
+ DCHECK(!may_contain_strings_);
+ // We don't know anything about max matches for negated classes.
+ // As there are no strings involved, assume that we can match a unicode
+ // character (up to 2 UTF-16 code units).
+ max_match_ = 2;
+ } else {
+ max_match_ = 0;
+ for (auto op : *operands) {
+ max_match_ = std::max(max_match_, op->max_match());
+ }
}
}
diff --git a/js/src/irregexp/imported/regexp-ast.h b/js/src/irregexp/imported/regexp-ast.h
index af90b1dda3..b2b88515d3 100644
--- a/js/src/irregexp/imported/regexp-ast.h
+++ b/js/src/irregexp/imported/regexp-ast.h
@@ -130,12 +130,6 @@ class CharacterRange {
static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
Zone* zone);
-#ifdef V8_INTL_SUPPORT
- // Creates the closeOver of the given UnicodeSet, removing all
- // characters/strings that can't be derived via simple case folding.
- static void UnicodeSimpleCloseOver(icu::UnicodeSet& set);
-#endif // V8_INTL_SUPPORT
-
bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
base::uc32 from() const { return from_; }
base::uc32 to() const { return to_; }
@@ -311,9 +305,12 @@ class RegExpClassRanges final : public RegExpTree {
// the specified ranges.
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
// surrogate and should not be unicode-desugared (crbug.com/641091).
+ // IS_CASE_FOLDED: If case folding is required (/i), it was already
+ // performed on individual ranges and should not be applied again.
enum Flag {
NEGATED = 1 << 0,
CONTAINS_SPLIT_SURROGATE = 1 << 1,
+ IS_CASE_FOLDED = 1 << 2,
};
using ClassRangesFlags = base::Flags<Flag>;
@@ -356,6 +353,9 @@ class RegExpClassRanges final : public RegExpTree {
bool contains_split_surrogate() const {
return (class_ranges_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
}
+ bool is_case_folded() const {
+ return (class_ranges_flags_ & IS_CASE_FOLDED) != 0;
+ }
private:
CharacterSet set_;
@@ -626,8 +626,9 @@ class RegExpCapture final : public RegExpTree {
class RegExpGroup final : public RegExpTree {
public:
- explicit RegExpGroup(RegExpTree* body)
+ explicit RegExpGroup(RegExpTree* body, RegExpFlags flags)
: body_(body),
+ flags_(flags),
min_match_(body->min_match()),
max_match_(body->max_match()) {}
@@ -639,9 +640,11 @@ class RegExpGroup final : public RegExpTree {
int max_match() override { return max_match_; }
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
RegExpTree* body() const { return body_; }
+ RegExpFlags flags() const { return flags_; }
private:
RegExpTree* body_;
+ const RegExpFlags flags_;
int min_match_;
int max_match_;
};
@@ -651,12 +654,13 @@ class RegExpLookaround final : public RegExpTree {
enum Type { LOOKAHEAD, LOOKBEHIND };
RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
- int capture_from, Type type)
+ int capture_from, Type type, int index)
: body_(body),
is_positive_(is_positive),
capture_count_(capture_count),
capture_from_(capture_from),
- type_(type) {}
+ type_(type),
+ index_(index) {}
DECL_BOILERPLATE(Lookaround);
@@ -669,6 +673,7 @@ class RegExpLookaround final : public RegExpTree {
int capture_count() const { return capture_count_; }
int capture_from() const { return capture_from_; }
Type type() const { return type_; }
+ int index() const { return index_; }
class Builder {
public:
@@ -692,14 +697,17 @@ class RegExpLookaround final : public RegExpTree {
int capture_count_;
int capture_from_;
Type type_;
+ int index_;
};
class RegExpBackReference final : public RegExpTree {
public:
- explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {}
- RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
- : capture_(capture), flags_(flags) {}
+ explicit RegExpBackReference(Zone* zone) : captures_(1, zone) {}
+ explicit RegExpBackReference(RegExpCapture* capture, Zone* zone)
+ : captures_(1, zone) {
+ captures_.Add(capture, zone);
+ }
DECL_BOILERPLATE(BackReference);
@@ -707,16 +715,16 @@ class RegExpBackReference final : public RegExpTree {
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
// recursion, we give up. Ignorance is bliss.
int max_match() override { return kInfinity; }
- int index() const { return capture_->index(); }
- RegExpCapture* capture() const { return capture_; }
- void set_capture(RegExpCapture* capture) { capture_ = capture; }
+ const ZoneList<RegExpCapture*>* captures() const { return &captures_; }
+ void add_capture(RegExpCapture* capture, Zone* zone) {
+ captures_.Add(capture, zone);
+ }
const ZoneVector<base::uc16>* name() const { return name_; }
void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
private:
- RegExpCapture* capture_ = nullptr;
+ ZoneList<RegExpCapture*> captures_;
const ZoneVector<base::uc16>* name_ = nullptr;
- const RegExpFlags flags_;
};
diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.cc b/js/src/irregexp/imported/regexp-bytecode-generator.cc
index c83e10a598..251ed1cda5 100644
--- a/js/src/irregexp/imported/regexp-bytecode-generator.cc
+++ b/js/src/irregexp/imported/regexp-bytecode-generator.cc
@@ -383,7 +383,7 @@ Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
isolate_, zone(), source, buffer_.data(), length(), jump_edges_);
} else {
array = isolate_->factory()->NewByteArray(length());
- Copy(array->GetDataStartAddress());
+ Copy(array->begin());
}
return array;
diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.cc b/js/src/irregexp/imported/regexp-bytecode-peephole.cc
index ec8dcf1108..0ef0bab702 100644
--- a/js/src/irregexp/imported/regexp-bytecode-peephole.cc
+++ b/js/src/irregexp/imported/regexp-bytecode-peephole.cc
@@ -1012,13 +1012,13 @@ Handle<ByteArray> RegExpBytecodePeepholeOptimization::OptimizeBytecode(
RegExpBytecodePeephole peephole(zone, length, jump_edges);
bool did_optimize = peephole.OptimizeBytecode(bytecode, length);
Handle<ByteArray> array = isolate->factory()->NewByteArray(peephole.Length());
- peephole.CopyOptimizedBytecode(array->GetDataStartAddress());
+ peephole.CopyOptimizedBytecode(array->begin());
if (did_optimize && v8_flags.trace_regexp_peephole_optimization) {
PrintF("Original Bytecode:\n");
RegExpBytecodeDisassemble(bytecode, length, source->ToCString().get());
PrintF("Optimized Bytecode:\n");
- RegExpBytecodeDisassemble(array->GetDataStartAddress(), peephole.Length(),
+ RegExpBytecodeDisassemble(array->begin(), peephole.Length(),
source->ToCString().get());
}
diff --git a/js/src/irregexp/imported/regexp-compiler-tonode.cc b/js/src/irregexp/imported/regexp-compiler-tonode.cc
index f5087bdb08..b1340123d8 100644
--- a/js/src/irregexp/imported/regexp-compiler-tonode.cc
+++ b/js/src/irregexp/imported/regexp-compiler-tonode.cc
@@ -3,7 +3,6 @@
// found in the LICENSE file.
#include "irregexp/imported/regexp-compiler.h"
-
#include "irregexp/imported/regexp.h"
#ifdef V8_INTL_SUPPORT
@@ -418,27 +417,6 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
} // namespace
-#ifdef V8_INTL_SUPPORT
-// static
-void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
- // Remove characters for which closeOver() adds full-case-folding equivalents
- // because we should work only with simple case folding mappings.
- icu::UnicodeSet non_simple = icu::UnicodeSet(set);
- non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
- set.removeAll(non_simple);
-
- set.closeOver(USET_CASE_INSENSITIVE);
- // Full case folding maps single characters to multiple characters.
- // Those are represented as strings in the set. Remove them so that
- // we end up with only simple and common case mappings.
- set.removeAllStrings();
-
- // Add characters that have non-simple case foldings again (they match
- // themselves).
- set.addAll(non_simple);
-}
-#endif // V8_INTL_SUPPORT
-
// static
void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
Zone* zone) {
@@ -460,8 +438,7 @@ void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
}
// Clear the ranges list without freeing the backing store.
ranges->Rewind(0);
-
- UnicodeSimpleCloseOver(set);
+ set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
for (int i = 0; i < set.getRangeCount(); i++) {
ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
}
@@ -476,7 +453,9 @@ RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
Zone* const zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone);
- if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
+ const bool needs_case_folding =
+ NeedsUnicodeCaseEquivalents(compiler->flags()) && !is_case_folded();
+ if (needs_case_folding) {
CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
}
@@ -487,8 +466,7 @@ RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
if (is_negated()) {
// With /v, character classes are never negated.
- // TODO(v8:11935): Change permalink once proposal is in stage 4.
- // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom
+ // https://tc39.es/ecma262/#sec-compileatom
// Atom :: CharacterClass
// 4. Assert: cc.[[Invert]] is false.
// Instead the complement is created when evaluating the class set.
@@ -561,7 +539,12 @@ RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
}
}
if (!ranges()->is_empty()) {
- alternatives->Add(zone->template New<RegExpClassRanges>(zone, ranges()),
+ // In unicode sets mode case folding has to be done at precise locations
+ // (e.g. before building complements).
+ // It is therefore the parser's responsibility to case fold (sub-) ranges
+ // before creating ClassSetOperands.
+ alternatives->Add(zone->template New<RegExpClassRanges>(
+ zone, ranges(), RegExpClassRanges::IS_CASE_FOLDED),
zone);
}
if (empty_string != nullptr) {
@@ -1034,9 +1017,8 @@ namespace {
// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
RegExpNode* on_success,
- RegExpAssertion::Type type,
- RegExpFlags flags) {
- CHECK(NeedsUnicodeCaseEquivalents(flags));
+ RegExpAssertion::Type type) {
+ CHECK(NeedsUnicodeCaseEquivalents(compiler->flags()));
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* word_range =
zone->New<ZoneList<CharacterRange>>(2, zone);
@@ -1080,14 +1062,13 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
return AssertionNode::AtStart(on_success);
case Type::BOUNDARY:
return NeedsUnicodeCaseEquivalents(compiler->flags())
- ? BoundaryAssertionAsLookaround(
- compiler, on_success, Type::BOUNDARY, compiler->flags())
+ ? BoundaryAssertionAsLookaround(compiler, on_success,
+ Type::BOUNDARY)
: AssertionNode::AtBoundary(on_success);
case Type::NON_BOUNDARY:
return NeedsUnicodeCaseEquivalents(compiler->flags())
? BoundaryAssertionAsLookaround(compiler, on_success,
- Type::NON_BOUNDARY,
- compiler->flags())
+ Type::NON_BOUNDARY)
: AssertionNode::AtNonBoundary(on_success);
case Type::END_OF_INPUT:
return AssertionNode::AtEnd(on_success);
@@ -1130,10 +1111,17 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
- return compiler->zone()->New<BackReferenceNode>(
- RegExpCapture::StartRegister(index()),
- RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
- on_success);
+ RegExpNode* backref_node = on_success;
+ // Only one of the captures in the list can actually match. Since
+ // back-references to unmatched captures are treated as empty, we can simply
+ // create back-references to all possible captures.
+ for (auto capture : *captures()) {
+ backref_node = compiler->zone()->New<BackReferenceNode>(
+ RegExpCapture::StartRegister(capture->index()),
+ RegExpCapture::EndRegister(capture->index()), compiler->read_backward(),
+ backref_node);
+ }
+ return backref_node;
}
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
@@ -1141,9 +1129,40 @@ RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
return on_success;
}
+namespace {
+
+class V8_NODISCARD ModifiersScope {
+ public:
+ ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags)
+ : compiler_(compiler), previous_flags_(compiler->flags()) {
+ compiler->set_flags(flags);
+ }
+ ~ModifiersScope() { compiler_->set_flags(previous_flags_); }
+
+ private:
+ RegExpCompiler* compiler_;
+ const RegExpFlags previous_flags_;
+};
+
+} // namespace
+
RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
- return body_->ToNode(compiler, on_success);
+ // If no flags are modified, simply convert and return the body.
+ if (flags() == compiler->flags()) {
+ return body_->ToNode(compiler, on_success);
+ }
+ // Reset flags for successor node.
+ const RegExpFlags old_flags = compiler->flags();
+ on_success = ActionNode::ModifyFlags(old_flags, on_success);
+
+ // Convert body using modifier.
+ ModifiersScope modifiers_scope(compiler, flags());
+ RegExpNode* body = body_->ToNode(compiler, on_success);
+
+ // Wrap body into modifier node.
+ RegExpNode* modified_body = ActionNode::ModifyFlags(flags(), body);
+ return modified_body;
}
RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
diff --git a/js/src/irregexp/imported/regexp-compiler.cc b/js/src/irregexp/imported/regexp-compiler.cc
index 514975d8ed..73dfe1d2ad 100644
--- a/js/src/irregexp/imported/regexp-compiler.cc
+++ b/js/src/irregexp/imported/regexp-compiler.cc
@@ -707,6 +707,13 @@ ActionNode* ActionNode::EmptyMatchCheck(int start_register,
return result;
}
+ActionNode* ActionNode::ModifyFlags(RegExpFlags flags, RegExpNode* on_success) {
+ ActionNode* result =
+ on_success->zone()->New<ActionNode>(MODIFY_FLAGS, on_success);
+ result->data_.u_modify_flags.flags = flags;
+ return result;
+}
+
#define DEFINE_ACCEPT(Type) \
void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
@@ -1377,6 +1384,9 @@ void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
filled_in, not_at_start);
} else {
+ if (action_type() == MODIFY_FLAGS) {
+ compiler->set_flags(flags());
+ }
on_success()->GetQuickCheckDetails(details, compiler, filled_in,
not_at_start);
}
@@ -2867,7 +2877,7 @@ int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
const int kSkipArrayEntry = 0;
const int kDontSkipArrayEntry = 1;
- std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
+ std::memset(boolean_skip_table->begin(), kSkipArrayEntry,
boolean_skip_table->length());
for (int i = max_lookahead; i >= min_lookahead; i--) {
@@ -3454,6 +3464,11 @@ void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
assembler->Backtrack();
return;
}
+ case MODIFY_FLAGS: {
+ compiler->set_flags(flags());
+ on_success()->Emit(compiler, trace);
+ break;
+ }
default:
UNREACHABLE();
}
@@ -3473,8 +3488,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
RecursionCheck rc(compiler);
DCHECK_EQ(start_reg_ + 1, end_reg_);
- if (IsIgnoreCase(flags_)) {
- bool unicode = IsEitherUnicode(flags_);
+ if (IsIgnoreCase(compiler->flags())) {
+ bool unicode = IsEitherUnicode(compiler->flags());
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
unicode, trace->backtrack());
} else {
@@ -3485,7 +3500,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
// Check that the back reference does not end inside a surrogate pair.
- if (IsEitherUnicode(flags_) && !compiler->one_byte()) {
+ if (IsEitherUnicode(compiler->flags()) && !compiler->one_byte()) {
assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
}
on_success()->Emit(compiler, trace);
@@ -3707,7 +3722,7 @@ class Analysis : public NodeVisitor {
} while (false)
void VisitText(TextNode* that) override {
- that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
+ that->MakeCaseIndependent(isolate(), is_one_byte_, flags());
EnsureAnalyzed(that->on_success());
if (has_failed()) return;
that->CalculateOffsets();
@@ -3715,6 +3730,9 @@ class Analysis : public NodeVisitor {
}
void VisitAction(ActionNode* that) override {
+ if (that->action_type() == ActionNode::MODIFY_FLAGS) {
+ set_flags(that->flags());
+ }
EnsureAnalyzed(that->on_success());
if (has_failed()) return;
STATIC_FOR_EACH(Propagators::VisitAction(that));
@@ -3773,9 +3791,12 @@ class Analysis : public NodeVisitor {
#undef STATIC_FOR_EACH
private:
+ RegExpFlags flags() const { return flags_; }
+ void set_flags(RegExpFlags flags) { flags_ = flags; }
+
Isolate* isolate_;
const bool is_one_byte_;
- const RegExpFlags flags_;
+ RegExpFlags flags_;
RegExpError error_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
@@ -3903,13 +3924,12 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
}
RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
- RegExpFlags flags,
bool is_one_byte) {
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body =
RegExpCapture::ToNode(data->tree, 0, this, accept());
RegExpNode* node = captured_body;
- if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
+ if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags())) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
RegExpNode* loop_node = RegExpQuantifier::ToNode(
@@ -3931,13 +3951,14 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
}
}
if (is_one_byte) {
- node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
+ node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags());
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
if (node != nullptr) {
- node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
+ node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags());
}
- } else if (IsEitherUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
+ } else if (IsEitherUnicode(flags()) &&
+ (IsGlobal(flags()) || IsSticky(flags()))) {
node = OptionallyStepBackToLeadSurrogate(node);
}
diff --git a/js/src/irregexp/imported/regexp-compiler.h b/js/src/irregexp/imported/regexp-compiler.h
index 91dd43ab8a..7a369430bb 100644
--- a/js/src/irregexp/imported/regexp-compiler.h
+++ b/js/src/irregexp/imported/regexp-compiler.h
@@ -501,8 +501,7 @@ class RegExpCompiler {
// - Inserting the implicit .* before/after the regexp if necessary.
// - If the input is a one-byte string, filtering out nodes that can't match.
// - Fixing up regexp matches that start within a surrogate pair.
- RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags,
- bool is_one_byte);
+ RegExpNode* PreprocessRegExp(RegExpCompileData* data, bool is_one_byte);
// If the regexp matching starts within a surrogate pair, step back to the
// lead surrogate and start matching from there.
@@ -527,7 +526,8 @@ class RegExpCompiler {
inline void IncrementRecursionDepth() { recursion_depth_++; }
inline void DecrementRecursionDepth() { recursion_depth_--; }
- RegExpFlags flags() const { return flags_; }
+ inline RegExpFlags flags() const { return flags_; }
+ inline void set_flags(RegExpFlags flags) { flags_ = flags; }
void SetRegExpTooBig() { reg_exp_too_big_ = true; }
@@ -571,7 +571,7 @@ class RegExpCompiler {
int unicode_lookaround_position_register_;
ZoneVector<RegExpNode*>* work_list_;
int recursion_depth_;
- const RegExpFlags flags_;
+ RegExpFlags flags_;
RegExpMacroAssembler* macro_assembler_;
bool one_byte_;
bool reg_exp_too_big_;
diff --git a/js/src/irregexp/imported/regexp-dotprinter.cc b/js/src/irregexp/imported/regexp-dotprinter.cc
index 6746992a0a..cd0ca5dea8 100644
--- a/js/src/irregexp/imported/regexp-dotprinter.cc
+++ b/js/src/irregexp/imported/regexp-dotprinter.cc
@@ -231,6 +231,10 @@ void DotPrinterImpl::VisitAction(ActionNode* that) {
<< "\", shape=septagon";
break;
}
+ case ActionNode::MODIFY_FLAGS: {
+ os_ << "label=\"flags $" << that->flags() << "\", shape=septagon";
+ break;
+ }
}
os_ << "];\n";
PrintAttributes(that);
diff --git a/js/src/irregexp/imported/regexp-interpreter.cc b/js/src/irregexp/imported/regexp-interpreter.cc
index 43c8a4a5a4..2de1b12968 100644
--- a/js/src/irregexp/imported/regexp-interpreter.cc
+++ b/js/src/irregexp/imported/regexp-interpreter.cc
@@ -88,8 +88,7 @@ int32_t Load32Aligned(const uint8_t* pc) {
return *reinterpret_cast<const int32_t*>(pc);
}
-// TODO(jgruber): Rename to Load16AlignedUnsigned.
-uint32_t Load16Aligned(const uint8_t* pc) {
+uint32_t Load16AlignedUnsigned(const uint8_t* pc) {
DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
return *reinterpret_cast<const uint16_t*>(pc);
}
@@ -221,17 +220,17 @@ IrregexpInterpreter::Result MaybeThrowStackOverflow(
template <typename Char>
void UpdateCodeAndSubjectReferences(
Isolate* isolate, Handle<ByteArray> code_array,
- Handle<String> subject_string, ByteArray* code_array_out,
+ Handle<String> subject_string, Tagged<ByteArray>* code_array_out,
const uint8_t** code_base_out, const uint8_t** pc_out,
- String* subject_string_out,
+ Tagged<String>* subject_string_out,
base::Vector<const Char>* subject_string_vector_out) {
DisallowGarbageCollection no_gc;
- if (*code_base_out != code_array->GetDataStartAddress()) {
+ if (*code_base_out != code_array->begin()) {
*code_array_out = *code_array;
const intptr_t pc_offset = *pc_out - *code_base_out;
DCHECK_GT(pc_offset, 0);
- *code_base_out = code_array->GetDataStartAddress();
+ *code_base_out = code_array->begin();
*pc_out = *code_base_out + pc_offset;
}
@@ -244,8 +243,9 @@ void UpdateCodeAndSubjectReferences(
// necessary.
template <typename Char>
IrregexpInterpreter::Result HandleInterrupts(
- Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out,
- String* subject_string_out, const uint8_t** code_base_out,
+ Isolate* isolate, RegExp::CallOrigin call_origin,
+ Tagged<ByteArray>* code_array_out, Tagged<String>* subject_string_out,
+ const uint8_t** code_base_out,
base::Vector<const Char>* subject_string_vector_out,
const uint8_t** pc_out) {
DisallowGarbageCollection no_gc;
@@ -276,12 +276,12 @@ IrregexpInterpreter::Result HandleInterrupts(
} else if (check.InterruptRequested()) {
const bool was_one_byte =
String::IsOneByteRepresentationUnderneath(*subject_string_out);
- Object result;
+ Tagged<Object> result;
{
AllowGarbageCollection yes_gc;
result = isolate->stack_guard()->HandleInterrupts();
}
- if (result.IsException(isolate)) {
+ if (IsException(result, isolate)) {
return IrregexpInterpreter::EXCEPTION;
}
@@ -375,10 +375,10 @@ bool IndexIsInBounds(int index, int length) {
template <typename Char>
IrregexpInterpreter::Result RawMatch(
- Isolate* isolate, ByteArray code_array, String subject_string,
- base::Vector<const Char> subject, int* output_registers,
- int output_register_count, int total_register_count, int current,
- uint32_t current_char, RegExp::CallOrigin call_origin,
+ Isolate* isolate, Tagged<ByteArray> code_array,
+ Tagged<String> subject_string, base::Vector<const Char> subject,
+ int* output_registers, int output_register_count, int total_register_count,
+ int current, uint32_t current_char, RegExp::CallOrigin call_origin,
const uint32_t backtrack_limit) {
DisallowGarbageCollection no_gc;
@@ -430,7 +430,7 @@ IrregexpInterpreter::Result RawMatch(
#endif // V8_USE_COMPUTED_GOTO
- const uint8_t* pc = code_array.GetDataStartAddress();
+ const uint8_t* pc = code_array->begin();
const uint8_t* code_base = pc;
InterpreterRegisters registers(total_register_count, output_registers,
@@ -702,8 +702,8 @@ IrregexpInterpreter::Result RawMatch(
}
BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
uint32_t c = LoadPacked24Unsigned(insn);
- uint32_t minus = Load16Aligned(pc + 4);
- uint32_t mask = Load16Aligned(pc + 6);
+ uint32_t minus = Load16AlignedUnsigned(pc + 4);
+ uint32_t mask = Load16AlignedUnsigned(pc + 6);
if (c != ((current_char - minus) & mask)) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
} else {
@@ -712,8 +712,8 @@ IrregexpInterpreter::Result RawMatch(
DISPATCH();
}
BYTECODE(CHECK_CHAR_IN_RANGE) {
- uint32_t from = Load16Aligned(pc + 4);
- uint32_t to = Load16Aligned(pc + 6);
+ uint32_t from = Load16AlignedUnsigned(pc + 4);
+ uint32_t to = Load16AlignedUnsigned(pc + 6);
if (from <= current_char && current_char <= to) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
} else {
@@ -722,8 +722,8 @@ IrregexpInterpreter::Result RawMatch(
DISPATCH();
}
BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
- uint32_t from = Load16Aligned(pc + 4);
- uint32_t to = Load16Aligned(pc + 6);
+ uint32_t from = Load16AlignedUnsigned(pc + 4);
+ uint32_t to = Load16AlignedUnsigned(pc + 6);
if (from > current_char || current_char > to) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
} else {
@@ -914,7 +914,7 @@ IrregexpInterpreter::Result RawMatch(
BYTECODE(SKIP_UNTIL_CHAR) {
int32_t load_offset = LoadPacked24Signed(insn);
int32_t advance = Load16AlignedSigned(pc + 4);
- uint32_t c = Load16Aligned(pc + 6);
+ uint32_t c = Load16AlignedUnsigned(pc + 6);
while (IndexIsInBounds(current + load_offset, subject.length())) {
current_char = subject[current + load_offset];
if (c == current_char) {
@@ -929,7 +929,7 @@ IrregexpInterpreter::Result RawMatch(
BYTECODE(SKIP_UNTIL_CHAR_AND) {
int32_t load_offset = LoadPacked24Signed(insn);
int32_t advance = Load16AlignedSigned(pc + 4);
- uint16_t c = Load16Aligned(pc + 6);
+ uint16_t c = Load16AlignedUnsigned(pc + 6);
uint32_t mask = Load32Aligned(pc + 8);
int32_t maximum_offset = Load32Aligned(pc + 12);
while (static_cast<uintptr_t>(current + maximum_offset) <=
@@ -947,7 +947,7 @@ IrregexpInterpreter::Result RawMatch(
BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) {
int32_t load_offset = LoadPacked24Signed(insn);
int32_t advance = Load16AlignedSigned(pc + 4);
- uint16_t c = Load16Aligned(pc + 6);
+ uint16_t c = Load16AlignedUnsigned(pc + 6);
int32_t maximum_offset = Load32Aligned(pc + 8);
while (static_cast<uintptr_t>(current + maximum_offset) <=
static_cast<uintptr_t>(subject.length())) {
@@ -979,7 +979,7 @@ IrregexpInterpreter::Result RawMatch(
BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) {
int32_t load_offset = LoadPacked24Signed(insn);
int32_t advance = Load16AlignedSigned(pc + 4);
- uint16_t limit = Load16Aligned(pc + 6);
+ uint16_t limit = Load16AlignedUnsigned(pc + 6);
const uint8_t* table = pc + 8;
while (IndexIsInBounds(current + load_offset, subject.length())) {
current_char = subject[current + load_offset];
@@ -999,8 +999,8 @@ IrregexpInterpreter::Result RawMatch(
BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) {
int32_t load_offset = LoadPacked24Signed(insn);
int32_t advance = Load32Aligned(pc + 4);
- uint16_t c = Load16Aligned(pc + 8);
- uint16_t c2 = Load16Aligned(pc + 10);
+ uint16_t c = Load16AlignedUnsigned(pc + 8);
+ uint16_t c2 = Load16AlignedUnsigned(pc + 10);
while (IndexIsInBounds(current + load_offset, subject.length())) {
current_char = subject[current + load_offset];
// The two if-statements below are split up intentionally, as combining
@@ -1047,29 +1047,29 @@ IrregexpInterpreter::Result RawMatch(
// static
IrregexpInterpreter::Result IrregexpInterpreter::Match(
- Isolate* isolate, JSRegExp regexp, String subject_string,
+ Isolate* isolate, Tagged<JSRegExp> regexp, Tagged<String> subject_string,
int* output_registers, int output_register_count, int start_position,
RegExp::CallOrigin call_origin) {
- if (v8_flags.regexp_tier_up) regexp.TierUpTick();
+ if (v8_flags.regexp_tier_up) regexp->TierUpTick();
bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string);
- ByteArray code_array = ByteArray::cast(regexp.bytecode(is_one_byte));
- int total_register_count = regexp.max_register_count();
+ Tagged<ByteArray> code_array = ByteArray::cast(regexp->bytecode(is_one_byte));
+ int total_register_count = regexp->max_register_count();
return MatchInternal(isolate, code_array, subject_string, output_registers,
output_register_count, total_register_count,
- start_position, call_origin, regexp.backtrack_limit());
+ start_position, call_origin, regexp->backtrack_limit());
}
IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
- Isolate* isolate, ByteArray code_array, String subject_string,
- int* output_registers, int output_register_count, int total_register_count,
- int start_position, RegExp::CallOrigin call_origin,
- uint32_t backtrack_limit) {
- DCHECK(subject_string.IsFlat());
+ Isolate* isolate, Tagged<ByteArray> code_array,
+ Tagged<String> subject_string, int* output_registers,
+ int output_register_count, int total_register_count, int start_position,
+ RegExp::CallOrigin call_origin, uint32_t backtrack_limit) {
+ DCHECK(subject_string->IsFlat());
// TODO(chromium:1262676): Remove this CHECK once fixed.
- CHECK(code_array.IsByteArray());
+ CHECK(IsByteArray(code_array));
// Note: Heap allocation *is* allowed in two situations if calling from
// Runtime:
@@ -1080,7 +1080,7 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
DisallowGarbageCollection no_gc;
base::uc16 previous_char = '\n';
- String::FlatContent subject_content = subject_string.GetFlatContent(no_gc);
+ String::FlatContent subject_content = subject_string->GetFlatContent(no_gc);
// Because interrupts can result in GC and string content relocation, the
// checksum verification in FlatContent may fail even though this code is
// safe. See (2) above.
@@ -1122,10 +1122,10 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
DisallowHandleAllocation no_handles;
DisallowHandleDereference no_deref;
- String subject_string = String::cast(Object(subject));
- JSRegExp regexp_obj = JSRegExp::cast(Object(regexp));
+ Tagged<String> subject_string = String::cast(Tagged<Object>(subject));
+ Tagged<JSRegExp> regexp_obj = JSRegExp::cast(Tagged<Object>(regexp));
- if (regexp_obj.MarkedForTierUp()) {
+ if (regexp_obj->MarkedForTierUp()) {
// Returning RETRY will re-enter through runtime, where actual recompilation
// for tier-up takes place.
return IrregexpInterpreter::RETRY;
diff --git a/js/src/irregexp/imported/regexp-interpreter.h b/js/src/irregexp/imported/regexp-interpreter.h
index bc55be2b8c..825916291f 100644
--- a/js/src/irregexp/imported/regexp-interpreter.h
+++ b/js/src/irregexp/imported/regexp-interpreter.h
@@ -49,17 +49,18 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
RegExp::CallOrigin call_origin,
Isolate* isolate, Address regexp);
- static Result MatchInternal(Isolate* isolate, ByteArray code_array,
- String subject_string, int* output_registers,
- int output_register_count,
+ static Result MatchInternal(Isolate* isolate, Tagged<ByteArray> code_array,
+ Tagged<String> subject_string,
+ int* output_registers, int output_register_count,
int total_register_count, int start_position,
RegExp::CallOrigin call_origin,
uint32_t backtrack_limit);
private:
- static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,
- int* output_registers, int output_register_count,
- int start_position, RegExp::CallOrigin call_origin);
+ static Result Match(Isolate* isolate, Tagged<JSRegExp> regexp,
+ Tagged<String> subject_string, int* output_registers,
+ int output_register_count, int start_position,
+ RegExp::CallOrigin call_origin);
};
} // namespace internal
diff --git a/js/src/irregexp/imported/regexp-macro-assembler.cc b/js/src/irregexp/imported/regexp-macro-assembler.cc
index b4d99bf775..b99c08424e 100644
--- a/js/src/irregexp/imported/regexp-macro-assembler.cc
+++ b/js/src/irregexp/imported/regexp-macro-assembler.cc
@@ -182,24 +182,25 @@ uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char,
static constexpr uint32_t kTrue = 1;
static constexpr uint32_t kFalse = 0;
- FixedUInt16Array ranges = FixedUInt16Array::cast(Object(raw_byte_array));
- DCHECK_GE(ranges.length(), 1);
+ Tagged<FixedUInt16Array> ranges =
+ FixedUInt16Array::cast(Tagged<Object>(raw_byte_array));
+ DCHECK_GE(ranges->length(), 1);
// Shortcut for fully out of range chars.
- if (current_char < ranges.get(0)) return kFalse;
- if (current_char >= ranges.get(ranges.length() - 1)) {
+ if (current_char < ranges->get(0)) return kFalse;
+ if (current_char >= ranges->get(ranges->length() - 1)) {
// The last range may be open-ended.
- return (ranges.length() % 2) == 0 ? kFalse : kTrue;
+ return (ranges->length() % 2) == 0 ? kFalse : kTrue;
}
// Binary search for the matching range. `ranges` is encoded as
// [from0, to0, from1, to1, ..., fromN, toN], or
// [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
- int mid, lower = 0, upper = ranges.length();
+ int mid, lower = 0, upper = ranges->length();
do {
mid = lower + (upper - lower) / 2;
- const base::uc16 elem = ranges.get(mid);
+ const base::uc16 elem = ranges->get(mid);
if (current_char < elem) {
upper = mid;
} else if (current_char > elem) {
@@ -210,7 +211,7 @@ uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char,
}
} while (lower < upper);
- const bool current_char_ge_last_elem = current_char >= ranges.get(mid);
+ const bool current_char_ge_last_elem = current_char >= ranges->get(mid);
const int current_range_start_index =
current_char_ge_last_elem ? mid : mid - 1;
@@ -277,15 +278,16 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() const {
// static
int NativeRegExpMacroAssembler::CheckStackGuardState(
Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
- Address* return_address, InstructionStream re_code, Address* subject,
- const uint8_t** input_start, const uint8_t** input_end) {
+ Address* return_address, Tagged<InstructionStream> re_code,
+ Address* subject, const uint8_t** input_start, const uint8_t** input_end,
+ uintptr_t gap) {
DisallowGarbageCollection no_gc;
Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
- DCHECK_LE(re_code.instruction_start(), old_pc);
- DCHECK_LE(old_pc, re_code.code(kAcquireLoad).instruction_end());
+ DCHECK_LE(re_code->instruction_start(), old_pc);
+ DCHECK_LE(old_pc, re_code->code(kAcquireLoad)->instruction_end());
StackLimitCheck check(isolate);
- bool js_has_overflowed = check.JsHasOverflowed();
+ bool js_has_overflowed = check.JsHasOverflowed(gap);
if (call_origin == RegExp::CallOrigin::kFromJs) {
// Direct calls from JavaScript can be interrupted in two ways:
@@ -310,7 +312,8 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
// Prepare for possible GC.
HandleScope handles(isolate);
Handle<InstructionStream> code_handle(re_code, isolate);
- Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
+ Handle<String> subject_handle(String::cast(Tagged<Object>(*subject)),
+ isolate);
bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
int return_value = 0;
@@ -322,8 +325,8 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
return_value = EXCEPTION;
} else if (check.InterruptRequested()) {
AllowGarbageCollection yes_gc;
- Object result = isolate->stack_guard()->HandleInterrupts();
- if (result.IsException(isolate)) return_value = EXCEPTION;
+ Tagged<Object> result = isolate->stack_guard()->HandleInterrupts();
+ if (IsException(result, isolate)) return_value = EXCEPTION;
}
// We are not using operator == here because it does a slow DCHECK
@@ -371,34 +374,34 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
// DisallowGarbageCollection, since regexps might be preempted, and another
// thread might do allocation anyway.
- String subject_ptr = *subject;
+ Tagged<String> subject_ptr = *subject;
// Character offsets into string.
int start_offset = previous_index;
- int char_length = subject_ptr.length() - start_offset;
+ int char_length = subject_ptr->length() - start_offset;
int slice_offset = 0;
// The string has been flattened, so if it is a cons string it contains the
// full string in the first part.
if (StringShape(subject_ptr).IsCons()) {
- DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
- subject_ptr = ConsString::cast(subject_ptr).first();
+ DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
+ subject_ptr = ConsString::cast(subject_ptr)->first();
} else if (StringShape(subject_ptr).IsSliced()) {
- SlicedString slice = SlicedString::cast(subject_ptr);
- subject_ptr = slice.parent();
- slice_offset = slice.offset();
+ Tagged<SlicedString> slice = SlicedString::cast(subject_ptr);
+ subject_ptr = slice->parent();
+ slice_offset = slice->offset();
}
if (StringShape(subject_ptr).IsThin()) {
- subject_ptr = ThinString::cast(subject_ptr).actual();
+ subject_ptr = ThinString::cast(subject_ptr)->actual();
}
// Ensure that an underlying string has the same representation.
- bool is_one_byte = subject_ptr.IsOneByteRepresentation();
- DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
+ bool is_one_byte = subject_ptr->IsOneByteRepresentation();
+ DCHECK(IsExternalString(subject_ptr) || IsSeqString(subject_ptr));
// String is now either Sequential or External
int char_size_shift = is_one_byte ? 0 : 1;
DisallowGarbageCollection no_gc;
const uint8_t* input_start =
- subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
+ subject_ptr->AddressOfCharacterAt(start_offset + slice_offset, no_gc);
int byte_length = char_length << char_size_shift;
const uint8_t* input_end = input_start + byte_length;
return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
@@ -407,9 +410,9 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
// static
int NativeRegExpMacroAssembler::ExecuteForTesting(
- String input, int start_offset, const uint8_t* input_start,
+ Tagged<String> input, int start_offset, const uint8_t* input_start,
const uint8_t* input_end, int* output, int output_size, Isolate* isolate,
- JSRegExp regexp) {
+ Tagged<JSRegExp> regexp) {
return Execute(input, start_offset, input_start, input_end, output,
output_size, isolate, regexp);
}
@@ -419,13 +422,14 @@ int NativeRegExpMacroAssembler::ExecuteForTesting(
// the signature of the interpreter. We should get rid of JS objects passed to
// internal methods.
int NativeRegExpMacroAssembler::Execute(
- String input, // This needs to be the unpacked (sliced, cons) string.
+ Tagged<String>
+ input, // This needs to be the unpacked (sliced, cons) string.
int start_offset, const uint8_t* input_start, const uint8_t* input_end,
- int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
+ int* output, int output_size, Isolate* isolate, Tagged<JSRegExp> regexp) {
RegExpStackScope stack_scope(isolate);
bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
- Code code = Code::cast(regexp.code(is_one_byte));
+ Tagged<Code> code = Code::cast(regexp->code(isolate, is_one_byte));
RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
using RegexpMatcherSig =
@@ -439,7 +443,7 @@ int NativeRegExpMacroAssembler::Execute(
output, output_size, call_origin, isolate, regexp.ptr());
DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
- if (result == EXCEPTION && !isolate->has_pending_exception()) {
+ if (result == EXCEPTION && !isolate->has_exception()) {
// We detected a stack overflow (on the backtrack stack) in RegExp code,
// but haven't created the exception yet. Additionally, we allow heap
// allocation because even though it invalidates {input_start} and
diff --git a/js/src/irregexp/imported/regexp-macro-assembler.h b/js/src/irregexp/imported/regexp-macro-assembler.h
index af7e4f5297..6863adbaff 100644
--- a/js/src/irregexp/imported/regexp-macro-assembler.h
+++ b/js/src/irregexp/imported/regexp-macro-assembler.h
@@ -301,12 +301,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
int* offsets_vector, int offsets_vector_length,
int previous_index, Isolate* isolate);
- V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset,
- const uint8_t* input_start,
- const uint8_t* input_end,
- int* output, int output_size,
- Isolate* isolate,
- JSRegExp regexp);
+ V8_EXPORT_PRIVATE static int ExecuteForTesting(
+ Tagged<String> input, int start_offset, const uint8_t* input_start,
+ const uint8_t* input_end, int* output, int output_size, Isolate* isolate,
+ Tagged<JSRegExp> regexp);
bool CanReadUnaligned() const override;
@@ -330,9 +328,9 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
static int CheckStackGuardState(Isolate* isolate, int start_index,
RegExp::CallOrigin call_origin,
Address* return_address,
- InstructionStream re_code, Address* subject,
- const uint8_t** input_start,
- const uint8_t** input_end);
+ Tagged<InstructionStream> re_code,
+ Address* subject, const uint8_t** input_start,
+ const uint8_t** input_end, uintptr_t gap);
static Address word_character_map_address() {
return reinterpret_cast<Address>(&word_character_map[0]);
@@ -348,9 +346,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
private:
// Returns a {Result} sentinel, or the number of successful matches.
- static int Execute(String input, int start_offset, const uint8_t* input_start,
- const uint8_t* input_end, int* output, int output_size,
- Isolate* isolate, JSRegExp regexp);
+ static int Execute(Tagged<String> input, int start_offset,
+ const uint8_t* input_start, const uint8_t* input_end,
+ int* output, int output_size, Isolate* isolate,
+ Tagged<JSRegExp> regexp);
ZoneUnorderedMap<uint32_t, Handle<FixedUInt16Array>> range_array_cache_;
};
diff --git a/js/src/irregexp/imported/regexp-nodes.h b/js/src/irregexp/imported/regexp-nodes.h
index 9407f1c5ec..f3d7e6c58f 100644
--- a/js/src/irregexp/imported/regexp-nodes.h
+++ b/js/src/irregexp/imported/regexp-nodes.h
@@ -318,7 +318,8 @@ class ActionNode : public SeqRegExpNode {
BEGIN_NEGATIVE_SUBMATCH,
POSITIVE_SUBMATCH_SUCCESS,
EMPTY_MATCH_CHECK,
- CLEAR_CAPTURES
+ CLEAR_CAPTURES,
+ MODIFY_FLAGS
};
static ActionNode* SetRegisterForLoop(int reg, int val,
RegExpNode* on_success);
@@ -341,6 +342,7 @@ class ActionNode : public SeqRegExpNode {
int repetition_register,
int repetition_limit,
RegExpNode* on_success);
+ static ActionNode* ModifyFlags(RegExpFlags flags, RegExpNode* on_success);
void Accept(NodeVisitor* visitor) override;
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
@@ -353,6 +355,10 @@ class ActionNode : public SeqRegExpNode {
int GreedyLoopTextLength() override {
return kNodeIsTooComplexForGreedyLoops;
}
+ RegExpFlags flags() {
+ DCHECK_EQ(action_type(), MODIFY_FLAGS);
+ return RegExpFlags{data_.u_modify_flags.flags};
+ }
private:
union {
@@ -382,9 +388,13 @@ class ActionNode : public SeqRegExpNode {
int range_from;
int range_to;
} u_clear_captures;
+ struct {
+ int flags;
+ } u_modify_flags;
} data_;
ActionNode(ActionType action_type, RegExpNode* on_success)
: SeqRegExpNode(on_success), action_type_(action_type) {}
+
ActionType action_type_;
friend class DotPrinterImpl;
friend Zone;
@@ -499,12 +509,11 @@ class AssertionNode : public SeqRegExpNode {
class BackReferenceNode : public SeqRegExpNode {
public:
- BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags,
- bool read_backward, RegExpNode* on_success)
+ BackReferenceNode(int start_reg, int end_reg, bool read_backward,
+ RegExpNode* on_success)
: SeqRegExpNode(on_success),
start_reg_(start_reg),
end_reg_(end_reg),
- flags_(flags),
read_backward_(read_backward) {}
void Accept(NodeVisitor* visitor) override;
int start_register() { return start_reg_; }
@@ -522,7 +531,6 @@ class BackReferenceNode : public SeqRegExpNode {
private:
int start_reg_;
int end_reg_;
- RegExpFlags flags_;
bool read_backward_;
};
diff --git a/js/src/irregexp/imported/regexp-parser.cc b/js/src/irregexp/imported/regexp-parser.cc
index ea2a6c6d7a..965fc567b7 100644
--- a/js/src/irregexp/imported/regexp-parser.cc
+++ b/js/src/irregexp/imported/regexp-parser.cc
@@ -13,7 +13,7 @@
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/utf16.h" // For U16_NEXT
-#endif // V8_INTL_SUPPORT
+#endif // V8_INTL_SUPPORT
namespace v8 {
namespace internal {
@@ -67,8 +67,7 @@ class RegExpTextBuilder {
bool ignore_case() const { return IsIgnoreCase(flags_); }
bool IsUnicodeMode() const {
// Either /v or /u enable UnicodeMode
- // TODO(v8:11935): Change permalink once proposal is in stage 4.
- // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+ // https://tc39.es/ecma262/#sec-parsepattern
return IsUnicode(flags_) || IsUnicodeSets(flags_);
}
Zone* zone() const { return zone_; }
@@ -264,7 +263,7 @@ RegExpTree* RegExpTextBuilder::PopLastAtom() {
characters_ = nullptr;
atom = zone()->New<RegExpAtom>(char_vector);
return atom;
- } else if (text_.size() > 0) {
+ } else if (!text_.empty()) {
atom = text_.back();
text_.pop_back();
return atom;
@@ -315,8 +314,7 @@ class RegExpBuilder {
void FlushTerms();
bool IsUnicodeMode() const {
// Either /v or /u enable UnicodeMode
- // TODO(v8:11935): Change permalink once proposal is in stage 4.
- // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+ // https://tc39.es/ecma262/#sec-parsepattern
return IsUnicode(flags_) || IsUnicodeSets(flags_);
}
Zone* zone() const { return zone_; }
@@ -354,7 +352,12 @@ class RegExpParserState : public ZoneObject {
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index),
- capture_name_(capture_name) {}
+ capture_name_(capture_name) {
+ if (previous_state != nullptr) {
+ non_participating_capture_group_interval_ =
+ previous_state->non_participating_capture_group_interval();
+ }
+ }
// Parser state of containing expression, if any.
RegExpParserState* previous_state() const { return previous_state_; }
bool IsSubexpression() { return previous_state_ != nullptr; }
@@ -371,6 +374,9 @@ class RegExpParserState : public ZoneObject {
// The name of the current sub-expression, if group_type is CAPTURE. Only
// used for named captures.
const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
+ std::pair<int, int> non_participating_capture_group_interval() const {
+ return non_participating_capture_group_interval_;
+ }
bool IsNamedCapture() const { return capture_name_ != nullptr; }
@@ -398,6 +404,18 @@ class RegExpParserState : public ZoneObject {
return false;
}
+ void NewAlternative(int captures_started) {
+ if (non_participating_capture_group_interval().second != 0) {
+ // Extend the non-participating interval.
+ non_participating_capture_group_interval_.second = captures_started;
+ } else {
+ // Create new non-participating interval from the start of the current
+ // enclosing group to all captures created within that group so far.
+ non_participating_capture_group_interval_ =
+ std::make_pair(capture_index(), captures_started);
+ }
+ }
+
private:
// Linked list implementation of stack of states.
RegExpParserState* const previous_state_;
@@ -411,6 +429,11 @@ class RegExpParserState : public ZoneObject {
const int disjunction_capture_index_;
// Stored capture name (if any).
const ZoneVector<base::uc16>* const capture_name_;
+ // Interval of (named) capture indices ]from, to] that are not participating
+ // in the current state (i.e. they cannot match).
+ // Capture indices are not participating if they were created in a different
+ // alternative.
+ std::pair<int, int> non_participating_capture_group_interval_;
};
template <class CharT>
@@ -463,17 +486,22 @@ class RegExpParserImpl final {
RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
ClassSetOperandType* type_out,
ZoneList<CharacterRange>* ranges,
- CharacterClassStrings* strings);
+ CharacterClassStrings* strings,
+ base::uc32* character);
base::uc32 ParseClassSetCharacter();
// Parses and returns a single escaped character.
base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
bool* is_escaped_unicode_character);
+ void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges,
+ CharacterRange new_range);
+
RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,
RegExpTree* first_operand,
ClassSetOperandType first_operand_type,
ZoneList<CharacterRange>* ranges,
- CharacterClassStrings* strings);
+ CharacterClassStrings* strings,
+ base::uc32 first_character);
RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,
bool is_negated, RegExpTree* first_operand,
ClassSetOperandType first_operand_type);
@@ -504,11 +532,10 @@ class RegExpParserImpl final {
int captures_started() const { return captures_started_; }
int position() const { return next_pos_ - 1; }
bool failed() const { return failed_; }
- RegExpFlags flags() const { return top_level_flags_; }
+ RegExpFlags flags() const { return flags_; }
bool IsUnicodeMode() const {
// Either /v or /u enable UnicodeMode
- // TODO(v8:11935): Change permalink once proposal is in stage 4.
- // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+ // https://tc39.es/ecma262/#sec-parsepattern
return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;
}
bool unicode_sets() const { return IsUnicodeSets(flags()); }
@@ -528,7 +555,7 @@ class RegExpParserImpl final {
// Creates a new named capture at the specified index. Must be called exactly
// once for each named capture. Fails if a capture with the same name is
// encountered.
- bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
+ bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index);
// Parses the name of a capture group (?<name>pattern). The name must adhere
// to IdentifierName in the ECMAScript standard.
@@ -543,7 +570,7 @@ class RegExpParserImpl final {
// to avoid complicating cases in which references comes before the capture.
void PatchNamedBackReferences();
- ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
+ ZoneVector<RegExpCapture*>* GetNamedCaptures();
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
@@ -593,16 +620,20 @@ class RegExpParserImpl final {
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
- ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
+ // Maps capture names to a list of capture indices with this name.
+ ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>*
+ named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
+ ZoneList<CharacterRange>* temp_ranges_;
const CharT* const input_;
const int input_length_;
base::uc32 current_;
- const RegExpFlags top_level_flags_;
+ RegExpFlags flags_;
bool force_unicode_ = false; // Force parser to act as if unicode were set.
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.
+ int lookaround_count_; // Only valid after we have scanned for lookbehinds.
bool has_more_;
bool simple_;
bool contains_anchor_;
@@ -625,10 +656,11 @@ RegExpParserImpl<CharT>::RegExpParserImpl(
input_(input),
input_length_(input_length),
current_(kEndMarker),
- top_level_flags_(flags),
+ flags_(flags),
next_pos_(0),
captures_started_(0),
capture_count_(0),
+ lookaround_count_(0),
has_more_(true),
simple_(false),
contains_anchor_(false),
@@ -909,21 +941,21 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
// Build result of subexpression.
if (group_type == CAPTURE) {
if (state->IsNamedCapture()) {
- CreateNamedCaptureAtIndex(state->capture_name(),
- capture_index CHECK_FAILED);
+ CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED);
}
RegExpCapture* capture = GetCapture(capture_index);
capture->set_body(body);
body = capture;
} else if (group_type == GROUPING) {
- body = zone()->template New<RegExpGroup>(body);
+ body = zone()->template New<RegExpGroup>(body, builder->flags());
} else {
DCHECK(group_type == POSITIVE_LOOKAROUND ||
group_type == NEGATIVE_LOOKAROUND);
bool is_positive = (group_type == POSITIVE_LOOKAROUND);
body = zone()->template New<RegExpLookaround>(
body, is_positive, end_capture_index - capture_index,
- capture_index, state->lookaround_type());
+ capture_index, state->lookaround_type(), lookaround_count_);
+ lookaround_count_++;
}
// Restore previous state.
@@ -937,6 +969,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
}
case '|': {
Advance();
+ state->NewAlternative(captures_started());
builder->NewAlternative();
continue;
}
@@ -984,6 +1017,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
case '(': {
state = ParseOpenParenthesis(state CHECK_FAILED);
builder = state->builder();
+ flags_ = builder->flags();
continue;
}
case '[': {
@@ -1037,8 +1071,8 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
builder->AddEmpty();
} else {
RegExpCapture* capture = GetCapture(index);
- RegExpTree* atom = zone()->template New<RegExpBackReference>(
- capture, builder->flags());
+ RegExpTree* atom =
+ zone()->template New<RegExpBackReference>(capture, zone());
builder->AddAtom(atom);
}
break;
@@ -1246,43 +1280,91 @@ RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
bool is_named_capture = false;
const ZoneVector<base::uc16>* capture_name = nullptr;
SubexpressionType subexpr_type = CAPTURE;
+ RegExpFlags flags = state->builder()->flags();
+ bool parsing_modifiers = false;
+ bool modifiers_polarity = true;
+ RegExpFlags modifiers;
Advance();
if (current() == '?') {
- switch (Next()) {
- case ':':
- Advance(2);
- subexpr_type = GROUPING;
- break;
- case '=':
- Advance(2);
- lookaround_type = RegExpLookaround::LOOKAHEAD;
- subexpr_type = POSITIVE_LOOKAROUND;
- break;
- case '!':
- Advance(2);
- lookaround_type = RegExpLookaround::LOOKAHEAD;
- subexpr_type = NEGATIVE_LOOKAROUND;
- break;
- case '<':
- Advance();
- if (Next() == '=') {
+ do {
+ switch (Next()) {
+ case '-':
+ if (!v8_flags.js_regexp_modifiers) {
+ ReportError(RegExpError::kInvalidGroup);
+ return nullptr;
+ }
+ Advance();
+ parsing_modifiers = true;
+ if (modifiers_polarity == false) {
+ ReportError(RegExpError::kMultipleFlagDashes);
+ return nullptr;
+ }
+ modifiers_polarity = false;
+ break;
+ case 'm':
+ case 'i':
+ case 's': {
+ if (!v8_flags.js_regexp_modifiers) {
+ ReportError(RegExpError::kInvalidGroup);
+ return nullptr;
+ }
+ Advance();
+ parsing_modifiers = true;
+ RegExpFlag flag = TryRegExpFlagFromChar(current()).value();
+ if ((modifiers & flag) != 0) {
+ ReportError(RegExpError::kRepeatedFlag);
+ return nullptr;
+ }
+ modifiers |= flag;
+ flags.set(flag, modifiers_polarity);
+ break;
+ }
+ case ':':
+ Advance(2);
+ parsing_modifiers = false;
+ subexpr_type = GROUPING;
+ break;
+ case '=':
Advance(2);
- lookaround_type = RegExpLookaround::LOOKBEHIND;
+ parsing_modifiers = false;
+ lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = POSITIVE_LOOKAROUND;
break;
- } else if (Next() == '!') {
+ case '!':
Advance(2);
- lookaround_type = RegExpLookaround::LOOKBEHIND;
+ parsing_modifiers = false;
+ lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = NEGATIVE_LOOKAROUND;
break;
- }
- is_named_capture = true;
- has_named_captures_ = true;
- Advance();
- break;
- default:
- ReportError(RegExpError::kInvalidGroup);
- return nullptr;
+ case '<':
+ Advance();
+ parsing_modifiers = false;
+ if (Next() == '=') {
+ Advance(2);
+ lookaround_type = RegExpLookaround::LOOKBEHIND;
+ subexpr_type = POSITIVE_LOOKAROUND;
+ break;
+ } else if (Next() == '!') {
+ Advance(2);
+ lookaround_type = RegExpLookaround::LOOKBEHIND;
+ subexpr_type = NEGATIVE_LOOKAROUND;
+ break;
+ }
+ is_named_capture = true;
+ has_named_captures_ = true;
+ Advance();
+ break;
+ default:
+ ReportError(RegExpError::kInvalidGroup);
+ return nullptr;
+ }
+ } while (parsing_modifiers);
+ }
+ if (modifiers_polarity == false) {
+ // We encountered a dash.
+ if (modifiers == 0) {
+ ReportError(RegExpError::kInvalidFlagGroup);
+ return nullptr;
}
}
if (subexpr_type == CAPTURE) {
@@ -1299,7 +1381,7 @@ RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
// Store current state and begin new disjunction parsing.
return zone()->template New<RegExpParserState>(
state, subexpr_type, lookaround_type, captures_started_, capture_name,
- state->builder()->flags(), zone());
+ flags, zone());
}
// In order to know whether an escape is a backreference or not we have to scan
@@ -1511,7 +1593,10 @@ const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
template <class CharT>
bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
- const ZoneVector<base::uc16>* name, int index) {
+ const RegExpParserState* state, int index) {
+ const ZoneVector<base::uc16>* name = state->capture_name();
+ const std::pair<int, int> non_participating_capture_group_interval =
+ state->non_participating_capture_group_interval();
DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name);
@@ -1521,21 +1606,33 @@ bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
capture->set_name(name);
if (named_captures_ == nullptr) {
- named_captures_ =
- zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
- zone());
+ named_captures_ = zone_->template New<
+ ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone());
} else {
// Check for duplicates and bail if we find any.
-
const auto& named_capture_it = named_captures_->find(capture);
if (named_capture_it != named_captures_->end()) {
- ReportError(RegExpError::kDuplicateCaptureGroupName);
- return false;
+ if (v8_flags.js_regexp_duplicate_named_groups) {
+ ZoneList<int>* named_capture_indices = named_capture_it->second;
+ DCHECK_NOT_NULL(named_capture_indices);
+ DCHECK(!named_capture_indices->is_empty());
+ for (int named_index : *named_capture_indices) {
+ if (named_index < non_participating_capture_group_interval.first ||
+ named_index > non_participating_capture_group_interval.second) {
+ ReportError(RegExpError::kDuplicateCaptureGroupName);
+ return false;
+ }
+ }
+ } else {
+ ReportError(RegExpError::kDuplicateCaptureGroupName);
+ return false;
+ }
}
}
- named_captures_->emplace(capture);
-
+ auto entry = named_captures_->try_emplace(
+ capture, zone()->template New<ZoneList<int>>(1, zone()));
+ entry.first->second->Add(index, zone());
return true;
}
@@ -1558,7 +1655,7 @@ bool RegExpParserImpl<CharT>::ParseNamedBackReference(
builder->AddEmpty();
} else {
RegExpBackReference* atom =
- zone()->template New<RegExpBackReference>(builder->flags());
+ zone()->template New<RegExpBackReference>(zone());
atom->set_name(name);
builder->AddAtom(atom);
@@ -1595,16 +1692,17 @@ void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
DCHECK_NULL(search_capture->name());
search_capture->set_name(ref->name());
- int index = -1;
const auto& capture_it = named_captures_->find(search_capture);
- if (capture_it != named_captures_->end()) {
- index = (*capture_it)->index();
- } else {
+ if (capture_it == named_captures_->end()) {
ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
- ref->set_capture(GetCapture(index));
+ DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,
+ capture_it->second->length() == 1);
+ for (int index : *capture_it->second) {
+ ref->add_capture(GetCapture(index), zone());
+ }
}
}
@@ -1627,13 +1725,22 @@ RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
}
template <class CharT>
-ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
- if (named_captures_ == nullptr || named_captures_->empty()) {
+ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() {
+ if (named_captures_ == nullptr) {
return nullptr;
}
+ DCHECK(!named_captures_->empty());
- return zone()->template New<ZoneVector<RegExpCapture*>>(
- named_captures_->begin(), named_captures_->end(), zone());
+ ZoneVector<RegExpCapture*>* flattened_named_captures =
+ zone()->template New<ZoneVector<RegExpCapture*>>(zone());
+ for (auto capture : *named_captures_) {
+ DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,
+ capture.second->length() == 1);
+ for (int index : *capture.second) {
+ flattened_named_captures->push_back(GetCapture(index));
+ }
+ }
+ return flattened_named_captures;
}
template <class CharT>
@@ -1890,7 +1997,7 @@ bool LookupPropertyValueName(UProperty property,
ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
}
const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
- if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set);
+ if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
set.removeAllStrings();
if (negate) set.complement();
for (int i = 0; i < set.getRangeCount(); i++) {
@@ -2096,13 +2203,22 @@ bool RegExpParserImpl<CharT>::AddPropertyClassRange(
if (!IsSupportedBinaryProperty(property, unicode_sets())) return false;
if (!IsExactPropertyAlias(name, property)) return false;
// Negation of properties with strings is not allowed.
- // TODO(v8:11935): Change permalink once proposal is in stage 4.
// See
- // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-static-semantics-maycontainstrings
+ // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings
if (negate && IsBinaryPropertyOfStrings(property)) return false;
- return LookupPropertyValueName(property, negate ? "N" : "Y", false,
- add_to_ranges, add_to_strings, flags(),
- zone());
+ if (unicode_sets()) {
+ // In /v mode we can't simply look up the "false" binary property values,
+ // as the spec requires us to perform case folding before calculating the
+ // complement.
+ // See https://tc39.es/ecma262/#sec-compiletocharset
+ // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue
+ return LookupPropertyValueName(property, "Y", negate, add_to_ranges,
+ add_to_strings, flags(), zone());
+ } else {
+ return LookupPropertyValueName(property, negate ? "N" : "Y", false,
+ add_to_ranges, add_to_strings, flags(),
+ zone());
+ }
} else {
// Both property name and value name are specified. Attempt to interpret
// the property name as enumerated property.
@@ -2325,8 +2441,7 @@ base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
return c;
}
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassRanges
+// https://tc39.es/ecma262/#prod-ClassRanges
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(
ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) {
@@ -2475,8 +2590,7 @@ void AddClassString(ZoneList<base::uc32>* normalized_string,
} // namespace
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassStringDisjunction
+// https://tc39.es/ecma262/#prod-ClassStringDisjunction
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
@@ -2526,8 +2640,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
return nullptr;
}
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// https://tc39.es/ecma262/#prod-ClassSetOperand
// Tree returned based on type_out:
// * kNestedClass: RegExpClassSetExpression
// * For all other types: RegExpClassSetOperand
@@ -2538,12 +2651,13 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
zone()->template New<ZoneList<CharacterRange>>(1, zone());
CharacterClassStrings* strings =
zone()->template New<CharacterClassStrings>(zone());
- RegExpTree* tree =
- ParseClassSetOperand(builder, type_out, ranges, strings CHECK_FAILED);
+ base::uc32 character;
+ RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings,
+ &character CHECK_FAILED);
DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
tree == nullptr);
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
- ranges->length() == 1);
+ ranges->is_empty());
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
strings->empty());
DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
@@ -2558,21 +2672,27 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
// CharacterClassEscape includes \p{}, which can contain ranges, strings or
// both and \P{}, which could contain nothing (i.e. \P{Any}).
if (tree == nullptr) {
+ if (*type_out == ClassSetOperandType::kClassSetCharacter) {
+ AddMaybeSimpleCaseFoldedRange(ranges,
+ CharacterRange::Singleton(character));
+ }
tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
}
return tree;
}
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
-// Based on |type_out| either a tree is returned or ranges/strings modified.
-// If a tree is returned, ranges/strings are not modified.
-// If |type_out| is kNestedClass, a tree of type RegExpClassSetExpression is
-// returned. For all other types, ranges is modified and nullptr is returned.
+// https://tc39.es/ecma262/#prod-ClassSetOperand
+// Based on |type_out| either a tree is returned or
+// |ranges|/|strings|/|character| modified. If a tree is returned,
+// ranges/strings are not modified. If |type_out| is kNestedClass, a tree of
+// type RegExpClassSetExpression is returned. If |type_out| is
+// kClassSetCharacter, |character| is set and nullptr returned. For all other
+// types, |ranges|/|strings|/|character| is modified and nullptr is returned.
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
const RegExpBuilder* builder, ClassSetOperandType* type_out,
- ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
+ ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
+ base::uc32* character) {
DCHECK(unicode_sets());
base::uc32 c = current();
if (c == '\\') {
@@ -2599,7 +2719,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
*type_out = ClassSetOperandType::kClassSetCharacter;
c = ParseClassSetCharacter(CHECK_FAILED);
- ranges->Add(CharacterRange::Singleton(c), zone());
+ *character = c;
return nullptr;
}
@@ -2653,13 +2773,28 @@ bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
} // namespace
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassUnion
+template <class CharT>
+void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange(
+ ZoneList<CharacterRange>* ranges, CharacterRange new_range) {
+ DCHECK(unicode_sets());
+ if (ignore_case()) {
+ ZoneList<CharacterRange>* new_ranges =
+ zone()->template New<ZoneList<CharacterRange>>(2, zone());
+ new_ranges->Add(new_range, zone());
+ CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone());
+ ranges->AddAll(*new_ranges, zone());
+ } else {
+ ranges->Add(new_range, zone());
+ }
+ CharacterRange::Canonicalize(ranges);
+}
+
+// https://tc39.es/ecma262/#prod-ClassUnion
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
- CharacterClassStrings* strings) {
+ CharacterClassStrings* strings, base::uc32 character) {
DCHECK(unicode_sets());
ZoneList<RegExpTree*>* operands =
zone()->template New<ZoneList<RegExpTree*>>(2, zone());
@@ -2673,7 +2808,6 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
operands->Add(first_operand, zone());
}
ClassSetOperandType last_type = first_operand_type;
- const bool needs_case_folding = ignore_case();
while (has_more() && current() != ']') {
if (current() == '-') {
// Mix of ClassSetRange and ClassSubtraction is not allowed.
@@ -2690,42 +2824,36 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
// represent a character range.
// In case one of them is not a ClassSetCharacter, it is a syntax error,
// as '-' can not be used unescaped within a class with /v.
- // TODO(v8:11935): Change permalink once proposal is in stage 4.
// See
- // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetRange
+ // https://tc39.es/ecma262/#prod-ClassSetRange
if (last_type != ClassSetOperandType::kClassSetCharacter) {
return ReportError(RegExpError::kInvalidCharacterClass);
}
- ParseClassSetOperand(builder, &last_type, ranges, strings CHECK_FAILED);
+ base::uc32 from = character;
+ ParseClassSetOperand(builder, &last_type, ranges, strings,
+ &character CHECK_FAILED);
if (last_type != ClassSetOperandType::kClassSetCharacter) {
return ReportError(RegExpError::kInvalidCharacterClass);
}
- // Remove the last two singleton characters added to ranges, and combine
- // them into a range.
- auto rhs_ranges = ranges->RemoveLast();
- auto lhs_ranges = ranges->RemoveLast();
- DCHECK(lhs_ranges.IsSingleton());
- DCHECK(rhs_ranges.IsSingleton());
- base::uc32 from = lhs_ranges.from();
- base::uc32 to = rhs_ranges.from();
- if (from > to) {
+ if (from > character) {
return ReportError(RegExpError::kOutOfOrderCharacterClass);
}
- ranges->Add(CharacterRange::Range(from, to), zone());
+ AddMaybeSimpleCaseFoldedRange(ranges,
+ CharacterRange::Range(from, character));
last_type = ClassSetOperandType::kClassSetRange;
} else {
DCHECK_NE(current(), '-');
- RegExpTree* operand = ParseClassSetOperand(builder, &last_type, ranges,
- strings CHECK_FAILED);
+ if (last_type == ClassSetOperandType::kClassSetCharacter) {
+ AddMaybeSimpleCaseFoldedRange(ranges,
+ CharacterRange::Singleton(character));
+ }
+ RegExpTree* operand = ParseClassSetOperand(
+ builder, &last_type, ranges, strings, &character CHECK_FAILED);
if (operand != nullptr) {
may_contain_strings |= MayContainStrings(last_type, operand);
// Add the range we started building as operand and reset the current
// range.
if (!ranges->is_empty() || !strings->empty()) {
- if (needs_case_folding) {
- CharacterRange::Canonicalize(ranges);
- CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
- }
may_contain_strings |= !strings->empty();
operands->Add(
zone()->template New<RegExpClassSetOperand>(ranges, strings),
@@ -2742,12 +2870,12 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
return ReportError(RegExpError::kUnterminatedCharacterClass);
}
+ if (last_type == ClassSetOperandType::kClassSetCharacter) {
+ AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character));
+ }
+
// Add the range we started building as operand.
if (!ranges->is_empty() || !strings->empty()) {
- if (needs_case_folding) {
- CharacterRange::Canonicalize(ranges);
- CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
- }
may_contain_strings |= !strings->empty();
operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
zone());
@@ -2773,8 +2901,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
may_contain_strings, operands);
}
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassIntersection
+// https://tc39.es/ecma262/#prod-ClassIntersection
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
@@ -2815,8 +2942,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
may_contain_strings, operands);
}
-// TODO(v8:11935): Change permalink once proposal is in stage 4.
-// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSubtraction
+// https://tc39.es/ecma262/#prod-ClassSubtraction
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
@@ -2891,12 +3017,16 @@ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
ClassSetOperandType operand_type;
CharacterClassStrings* strings =
zone()->template New<CharacterClassStrings>(zone());
- RegExpTree* operand = ParseClassSetOperand(builder, &operand_type, ranges,
- strings CHECK_FAILED);
+ base::uc32 character;
+ RegExpTree* operand = ParseClassSetOperand(
+ builder, &operand_type, ranges, strings, &character CHECK_FAILED);
switch (current()) {
case '-':
if (Next() == '-') {
if (operand == nullptr) {
+ if (operand_type == ClassSetOperandType::kClassSetCharacter) {
+ ranges->Add(CharacterRange::Singleton(character), zone());
+ }
operand =
zone()->template New<RegExpClassSetOperand>(ranges, strings);
}
@@ -2908,6 +3038,9 @@ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
case '&':
if (Next() == '&') {
if (operand == nullptr) {
+ if (operand_type == ClassSetOperandType::kClassSetCharacter) {
+ ranges->Add(CharacterRange::Singleton(character), zone());
+ }
operand =
zone()->template New<RegExpClassSetOperand>(ranges, strings);
}
@@ -2916,7 +3049,7 @@ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
}
}
return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
- strings);
+ strings, character);
}
}
@@ -3047,7 +3180,7 @@ bool RegExpBuilder::AddQuantifierToAtom(
RegExpTree* atom = text_builder().PopLastAtom();
if (atom != nullptr) {
FlushText();
- } else if (terms_.size() > 0) {
+ } else if (!terms_.empty()) {
atom = terms_.back();
terms_.pop_back();
if (atom->IsLookaround()) {
diff --git a/js/src/irregexp/imported/regexp.h b/js/src/irregexp/imported/regexp.h
index 50269a4b71..5dc9070ed9 100644
--- a/js/src/irregexp/imported/regexp.h
+++ b/js/src/irregexp/imported/regexp.h
@@ -87,8 +87,8 @@ class RegExp final : public AllStatic {
RegExpFlags flags, uint32_t backtrack_limit);
// Ensures that a regexp is fully compiled and ready to be executed on a
- // subject string. Returns true on success. Return false on failure, and
- // then an exception will be pending.
+ // subject string. Returns true on success. Throws and returns false on
+ // failure.
V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> subject);
@@ -211,14 +211,16 @@ class RegExpResultsCache final : public AllStatic {
// Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
// On success, the returned result is guaranteed to be a COW-array.
- static Object Lookup(Heap* heap, String key_string, Object key_pattern,
- FixedArray* last_match_out, ResultsCacheType type);
+ static Tagged<Object> Lookup(Heap* heap, Tagged<String> key_string,
+ Tagged<Object> key_pattern,
+ Tagged<FixedArray>* last_match_out,
+ ResultsCacheType type);
// Attempt to add value_array to the cache specified by type. On success,
// value_array is turned into a COW-array.
static void Enter(Isolate* isolate, Handle<String> key_string,
Handle<Object> key_pattern, Handle<FixedArray> value_array,
Handle<FixedArray> last_match_cache, ResultsCacheType type);
- static void Clear(FixedArray cache);
+ static void Clear(Tagged<FixedArray> cache);
static constexpr int kRegExpResultsCacheSize = 0x100;
diff --git a/js/src/irregexp/imported/special-case.cc b/js/src/irregexp/imported/special-case.cc
index f5a9928b3a..d40ada6bb9 100644
--- a/js/src/irregexp/imported/special-case.cc
+++ b/js/src/irregexp/imported/special-case.cc
@@ -82,29 +82,6 @@ const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() {
return set.Pointer()->set;
}
-icu::UnicodeSet BuildUnicodeNonSimpleCloseOverSet() {
- icu::UnicodeSet set;
- set.add(0x390);
- set.add(0x3b0);
- set.add(0x1fd3);
- set.add(0x1fe3);
- set.add(0xfb05, 0xfb06);
- set.freeze();
- return set;
-}
-
-struct UnicodeNonSimpleCloseOverSetData {
- UnicodeNonSimpleCloseOverSetData() : set(BuildUnicodeNonSimpleCloseOverSet()) {}
- const icu::UnicodeSet set;
-};
-
-//static
-const icu::UnicodeSet& RegExpCaseFolding::UnicodeNonSimpleCloseOverSet() {
- static base::LazyInstance<UnicodeNonSimpleCloseOverSetData>::type set =
- LAZY_INSTANCE_INITIALIZER;
- return set.Pointer()->set;
-}
-
} // namespace internal
} // namespace v8
diff --git a/js/src/irregexp/imported/special-case.h b/js/src/irregexp/imported/special-case.h
index ea511af5a4..050d72a064 100644
--- a/js/src/irregexp/imported/special-case.h
+++ b/js/src/irregexp/imported/special-case.h
@@ -70,21 +70,11 @@ namespace internal {
// another character. Characters that match no other characters in
// their equivalence class are added to IgnoreSet. Characters that
// match at least one other character are added to SpecialAddSet.
-//
-// For unicode ignoreCase ("iu" and "iv"),
-// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in
-// the same equivalence class. This includes characaters that are in the same
-// equivalence class using full case folding. According to the spec, only
-// simple case folding shall be considered. We therefore create
-// UnicodeNonSimpleCloseOverSet containing all characters for which
-// UnicodeSet::closeOver adds characters that are not simple case folds. This
-// set should be used similar to IgnoreSet described above.
class RegExpCaseFolding final : public AllStatic {
public:
static const icu::UnicodeSet& IgnoreSet();
static const icu::UnicodeSet& SpecialAddSet();
- static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet();
// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
// Canonicalize) step 3, which is used to determine whether
diff --git a/js/src/irregexp/moz.build b/js/src/irregexp/moz.build
index ff030ad4bd..2c363ad349 100644
--- a/js/src/irregexp/moz.build
+++ b/js/src/irregexp/moz.build
@@ -14,9 +14,13 @@ include("../js-cxxflags.mozbuild")
CXXFLAGS += ["-Wno-error=type-limits", "-Wno-error=return-type"]
-# Suppress spurious warnings in third-party code. See bug 1810584.
+# Suppress spurious warnings in third-party code.
+# See bug 1810584 and bug 1879225.
if CONFIG["CC_TYPE"] == "gcc":
- CXXFLAGS += ["-Wno-error=nonnull"]
+ CXXFLAGS += ["-Wno-error=nonnull", "-Wno-narrowing"]
+if CONFIG["CC_TYPE"] in ("clang", "clang-cl"):
+ CXXFLAGS += ["-Wno-c++11-narrowing"]
+
UNIFIED_SOURCES += [
"imported/regexp-bytecode-generator.cc",
diff --git a/js/src/irregexp/moz.yaml b/js/src/irregexp/moz.yaml
index e230a89cfd..ca44833c24 100644
--- a/js/src/irregexp/moz.yaml
+++ b/js/src/irregexp/moz.yaml
@@ -9,8 +9,8 @@ origin:
description: A fast regular expression engine from V8
url: https://v8.dev
- release: 30a887aeb92153885619d8bb9fa57cda7adf9276 (Thu Jul 06 11:42:30 2023).
- revision: 30a887aeb92153885619d8bb9fa57cda7adf9276
+ release: e50ab13bbfaaf72717fd73d9a01434e4c3c1a0a8 (Thu Feb 29 03:38:59 2024).
+ revision: e50ab13bbfaaf72717fd73d9a01434e4c3c1a0a8
license: BSD-3-Clause
license-file: LICENSE.v8