summaryrefslogtreecommitdiffstats
path: root/js/src/irregexp/imported
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/irregexp/imported')
-rw-r--r--js/src/irregexp/imported/gen-regexp-special-case.cc214
-rw-r--r--js/src/irregexp/imported/property-sequences.cc1246
-rw-r--r--js/src/irregexp/imported/property-sequences.h27
-rw-r--r--js/src/irregexp/imported/regexp-ast.cc432
-rw-r--r--js/src/irregexp/imported/regexp-ast.h735
-rw-r--r--js/src/irregexp/imported/regexp-bytecode-generator-inl.h55
-rw-r--r--js/src/irregexp/imported/regexp-bytecode-generator.cc405
-rw-r--r--js/src/irregexp/imported/regexp-bytecode-generator.h140
-rw-r--r--js/src/irregexp/imported/regexp-bytecode-peephole.cc1027
-rw-r--r--js/src/irregexp/imported/regexp-bytecode-peephole.h30
-rw-r--r--js/src/irregexp/imported/regexp-bytecodes.cc46
-rw-r--r--js/src/irregexp/imported/regexp-bytecodes.h257
-rw-r--r--js/src/irregexp/imported/regexp-compiler-tonode.cc2042
-rw-r--r--js/src/irregexp/imported/regexp-compiler.cc3955
-rw-r--r--js/src/irregexp/imported/regexp-compiler.h621
-rw-r--r--js/src/irregexp/imported/regexp-dotprinter.cc249
-rw-r--r--js/src/irregexp/imported/regexp-dotprinter.h23
-rw-r--r--js/src/irregexp/imported/regexp-error.cc22
-rw-r--r--js/src/irregexp/imported/regexp-error.h67
-rw-r--r--js/src/irregexp/imported/regexp-interpreter.cc1147
-rw-r--r--js/src/irregexp/imported/regexp-interpreter.h68
-rw-r--r--js/src/irregexp/imported/regexp-macro-assembler-arch.h7
-rw-r--r--js/src/irregexp/imported/regexp-macro-assembler-tracer.cc438
-rw-r--r--js/src/irregexp/imported/regexp-macro-assembler-tracer.h90
-rw-r--r--js/src/irregexp/imported/regexp-macro-assembler.cc520
-rw-r--r--js/src/irregexp/imported/regexp-macro-assembler.h361
-rw-r--r--js/src/irregexp/imported/regexp-nodes.h775
-rw-r--r--js/src/irregexp/imported/regexp-parser.cc3131
-rw-r--r--js/src/irregexp/imported/regexp-parser.h34
-rw-r--r--js/src/irregexp/imported/regexp-stack.cc96
-rw-r--r--js/src/irregexp/imported/regexp-stack.h159
-rw-r--r--js/src/irregexp/imported/regexp.h236
-rw-r--r--js/src/irregexp/imported/special-case.cc111
-rw-r--r--js/src/irregexp/imported/special-case.h127
34 files changed, 18893 insertions, 0 deletions
diff --git a/js/src/irregexp/imported/gen-regexp-special-case.cc b/js/src/irregexp/imported/gen-regexp-special-case.cc
new file mode 100644
index 0000000000..8f6557ed30
--- /dev/null
+++ b/js/src/irregexp/imported/gen-regexp-special-case.cc
@@ -0,0 +1,214 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "irregexp/imported/special-case.h"
+#include "unicode/usetiter.h"
+
+namespace v8 {
+namespace internal {
+
+static const base::uc32 kSurrogateStart = 0xd800;  // First value of the range PrintSpecial skips.
+static const base::uc32 kSurrogateEnd = 0xdfff;  // Last value of that range (compared inclusively).
+static const base::uc32 kNonBmpStart = 0x10000;  // Exclusive upper bound of PrintSpecial's loop.
+
+// The following code generates "src/regexp/special-case.cc".
+void PrintSet(std::ofstream& out, const char* name,
+              const icu::UnicodeSet& set) {
+  // Emits Build<name>(), struct <name>Data and RegExpCaseFolding::<name>().
+  // std::hex is applied here so the "0x" literals hold for any stream state.
+  out << std::hex << "icu::UnicodeSet Build" << name << "() {\n"
+      << " icu::UnicodeSet set;\n";
+  for (int32_t i = 0; i < set.getRangeCount(); i++) {
+    const UChar32 first = set.getRangeStart(i);
+    const UChar32 last = set.getRangeEnd(i);
+    out << " set.add(0x" << first;
+    if (first != last) out << ", 0x" << last;  // One-element range: add(c).
+    out << ");\n";
+  }
+  out << " set.freeze();\n"
+      << " return set;\n"
+      << "}\n\n";
+  out << "struct " << name << "Data {\n"  // Holder whose ctor calls Build<name>.
+      << " " << name << "Data() : set(Build" << name << "()) {}\n"
+      << " const icu::UnicodeSet set;\n"
+      << "};\n\n";
+  // Accessor returning the set held in a function-local base::LazyInstance.
+  out << "//static\n"
+      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
+      << " static base::LazyInstance<" << name << "Data>::type set =\n"
+      << " LAZY_INSTANCE_INITIALIZER;\n"
+      << " return set.Pointer()->set;\n"
+      << "}\n\n";
+}
+
+void PrintSpecial(std::ofstream& out) {  // Computes and prints IgnoreSet and SpecialAddSet.
+ icu::UnicodeSet current;  // Scratch set, reset for each code point examined.
+ icu::UnicodeSet special_add;  // Printed at the end as "SpecialAddSet".
+ icu::UnicodeSet ignore;  // Printed at the end as "IgnoreSet".
+ UErrorCode status = U_ZERO_ERROR;
+ icu::UnicodeSet upper("[\\p{Lu}]", status);  // NOTE(review): |upper| is not used again below.
+ CHECK(U_SUCCESS(status));
+
+ // Iterate through all chars in BMP except surrogates.
+ for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
+ if (i >= static_cast<UChar32>(kSurrogateStart) &&
+ i <= static_cast<UChar32>(kSurrogateEnd)) {
+ continue; // Ignore surrogate range
+ }
+ current.set(i, i);
+ current.closeOver(USET_CASE_INSENSITIVE);
+
+ // Check to see if all characters in the case-folding equivalence
+ // class as defined by UnicodeSet::closeOver all map to the same
+ // canonical value.
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
+ bool class_has_matching_canonical_char = false;
+ bool class_has_non_matching_canonical_char = false;
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
+ c++) {
+ if (c == i) {  // Only the *other* members of the class are compared.
+ continue;
+ }
+ UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
+ if (canonical == other_canonical) {
+ class_has_matching_canonical_char = true;
+ } else {
+ class_has_non_matching_canonical_char = true;
+ }
+ }
+ }
+ // i is classified only if some other member of its equivalence
+ // class canonicalizes differently from i. In that case:
+ //  - if no other member shares i's canonical value, i goes into
+ //    |ignore| (skipped when adding case-independent alternatives);
+ //  - if at least one other member does share it, i goes into
+ //    |special_add| and needs special handling.
+ if (class_has_non_matching_canonical_char) {
+ if (class_has_matching_canonical_char) {
+ special_add.add(i);
+ } else {
+ ignore.add(i);
+ }
+ }
+ }
+
+ // Verify that no Unicode equivalence class contains two non-trivial
+ // JS equivalence classes. Every character in SpecialAddSet has the
+ // same canonical value as every other non-IgnoreSet character in
+ // its Unicode equivalence class. Therefore, if we call closeOver on
+ // a set containing no IgnoreSet characters, the only characters
+ // that must be removed from the result are in IgnoreSet. This fact
+ // is used in CharacterRange::AddCaseEquivalents.
+ for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
+ for (UChar32 c = special_add.getRangeStart(i);
+ c <= special_add.getRangeEnd(i); c++) {
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
+ current.set(c, c);
+ current.closeOver(USET_CASE_INSENSITIVE);
+ current.removeAll(ignore);  // Drop members already classified into |ignore|.
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c2 = current.getRangeStart(j);
+ c2 <= current.getRangeEnd(j); c2++) {
+ CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));  // All remaining members must agree with c.
+ }
+ }
+ }
+ }
+
+ PrintSet(out, "IgnoreSet", ignore);
+ PrintSet(out, "SpecialAddSet", special_add);
+}
+
+void PrintUnicodeSpecial(std::ofstream& out) {  // Computes and prints UnicodeNonSimpleCloseOverSet.
+ icu::UnicodeSet non_simple_folding;  // Result set, printed at the end.
+ icu::UnicodeSet current;  // Scratch set, reset for each code point examined.
+ UErrorCode status = U_ZERO_ERROR;
+ // Look at all characters except white spaces.
+ icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status);
+ CHECK_EQ(status, U_ZERO_ERROR);  // Only exactly U_ZERO_ERROR is accepted here.
+ icu::UnicodeSetIterator iter(interestingCP);
+ while (iter.next()) {
+ UChar32 c = iter.getCodepoint();
+ current.set(c, c);
+ current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings();
+ CHECK(!current.isBogus());
+ // Remove characters from the closeover that have a simple case folding.
+ icu::UnicodeSet toRemove;
+ icu::UnicodeSetIterator closeOverIter(current);
+ while (closeOverIter.next()) {
+ UChar32 closeOverChar = closeOverIter.getCodepoint();
+ UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT);
+ if (closeOverChar != closeOverSCF) {  // u_foldCase maps it to a different code point.
+ toRemove.add(closeOverChar);
+ }
+ }
+ CHECK(!toRemove.isBogus());
+ current.removeAll(toRemove);  // Applied after the loop, not while iterating |current|.
+
+ // The current character and its simple case folding are also always OK.
+ UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+ current.remove(c);
+ current.remove(scf);
+
+ // If there are any characters remaining, they were added due to full case
+ // foldings and shouldn't match the current character according to the spec.
+ if (!current.isEmpty()) {
+ // Ensure that the character doesn't have a simple case folding.
+ // Otherwise the current approach of simply removing the character from
+ // the set before calling closeOver won't work.
+ CHECK_EQ(c, scf);
+ non_simple_folding.add(c);
+ }
+ }
+ CHECK(!non_simple_folding.isBogus());
+
+ PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding);
+}
+
+void WriteHeader(const char* header_filename) {
+  // Writes the whole generated file: banner, includes, the sets, closing lines.
+  std::ofstream out(header_filename);
+  CHECK(out.is_open());  // Fail instead of silently producing no output.
+  out << std::hex << std::setfill('0') << std::setw(4);  // NOTE: setw only affects the next <<.
+  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
+      << "// Use of this source code is governed by a BSD-style license that\n"
+      << "// can be found in the LICENSE file.\n\n"
+      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
+      << "// The following functions are used to build UnicodeSets\n"
+      << "// for special cases where the case-folding algorithm used by\n"
+      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
+      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
+      << "// Semantics: Canonicalize) step 3.\n\n"
+      << "#ifdef V8_INTL_SUPPORT\n"
+      << "#include \"src/base/lazy-instance.h\"\n\n"
+      << "#include \"src/regexp/special-case.h\"\n\n"
+      << "#include \"unicode/uniset.h\"\n"
+      << "namespace v8 {\n"
+      << "namespace internal {\n\n";
+  PrintSpecial(out);
+  PrintUnicodeSpecial(out);
+  out << "\n"
+      << "} // namespace internal\n"
+      << "} // namespace v8\n"
+      << "#endif // V8_INTL_SUPPORT\n";
+}
+
+} // namespace internal
+} // namespace v8
+
+int main(int argc, const char** argv) {
+  // Expects exactly one argument: the path of the file to generate.
+  if (argc != 2) {
+    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
+    return 1;  // Same exit status as std::exit(1), without needing <cstdlib>.
+  }
+  v8::internal::WriteHeader(argv[1]);
+  return 0;
+}
diff --git a/js/src/irregexp/imported/property-sequences.cc b/js/src/irregexp/imported/property-sequences.cc
new file mode 100644
index 0000000000..b37ec63115
--- /dev/null
+++ b/js/src/irregexp/imported/property-sequences.cc
@@ -0,0 +1,1246 @@
+// Copyright 2018 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifdef V8_INTL_SUPPORT
+
+#include "irregexp/imported/property-sequences.h"
+
+namespace v8 {
+namespace internal {
+
+/*
+Generated from the following Node.js source:
+
+package.json
+
+```
+{
+ "private": true,
+ "dependencies": {
+ "unicode-12.0.0": "^0.7.9"
+ }
+}
+```
+
+generate-unicode-sequence-property-data.js
+
+```
+const toHex = (symbol) => {
+ return '0x' + symbol.codePointAt(0).toString(16)
+ .toUpperCase().padStart(6, '0');
+};
+
+const generateData = (property) => {
+ const sequences =
+ require(`unicode-12.0.0/Sequence_Property/${ property }/index.js`);
+ const id = property.replace(/_/g, '') + 's';
+ const buffer = [];
+ for (const sequence of sequences) {
+ const symbols = [...sequence];
+ const codePoints = symbols.map(symbol => toHex(symbol));
+ buffer.push(' ' + codePoints.join(', ') + ', 0,');
+ }
+ const output =
+ `const base::uc32 UnicodePropertySequences::k${ id }[] = {\n` +
+ `${ buffer.join('\n') }\n 0 // null-terminating the list\n};\n`;
+ return output;
+};
+
+const properties = [
+ 'Emoji_Flag_Sequence',
+ 'Emoji_Tag_Sequence',
+ 'Emoji_ZWJ_Sequence',
+];
+
+for (const property of properties) {
+ console.log(generateData(property));
+}
+```
+*/
+
+// clang-format off
+const base::uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {  // Rows: one sequence's code points, then 0.
+ 0x01F1E6, 0x01F1E8, 0,
+ 0x01F1FF, 0x01F1FC, 0,  // NOTE(review): rows are not in sorted order; presumably order is irrelevant - confirm.
+ 0x01F1E6, 0x01F1EA, 0,
+ 0x01F1E6, 0x01F1EB, 0,
+ 0x01F1E6, 0x01F1EC, 0,
+ 0x01F1E6, 0x01F1EE, 0,
+ 0x01F1E6, 0x01F1F1, 0,
+ 0x01F1E6, 0x01F1F2, 0,
+ 0x01F1E6, 0x01F1F4, 0,
+ 0x01F1E6, 0x01F1F6, 0,
+ 0x01F1E6, 0x01F1F7, 0,
+ 0x01F1E6, 0x01F1F8, 0,
+ 0x01F1E6, 0x01F1F9, 0,
+ 0x01F1E6, 0x01F1FA, 0,
+ 0x01F1E6, 0x01F1FC, 0,
+ 0x01F1E6, 0x01F1FD, 0,
+ 0x01F1E6, 0x01F1FF, 0,
+ 0x01F1E7, 0x01F1E6, 0,
+ 0x01F1E7, 0x01F1E7, 0,
+ 0x01F1E7, 0x01F1E9, 0,
+ 0x01F1E7, 0x01F1EA, 0,
+ 0x01F1E7, 0x01F1EB, 0,
+ 0x01F1E7, 0x01F1EC, 0,
+ 0x01F1E7, 0x01F1ED, 0,
+ 0x01F1E7, 0x01F1EE, 0,
+ 0x01F1E7, 0x01F1EF, 0,
+ 0x01F1E7, 0x01F1F1, 0,
+ 0x01F1E7, 0x01F1F2, 0,
+ 0x01F1E7, 0x01F1F3, 0,
+ 0x01F1E7, 0x01F1F4, 0,
+ 0x01F1E7, 0x01F1F6, 0,
+ 0x01F1E7, 0x01F1F7, 0,
+ 0x01F1E7, 0x01F1F8, 0,
+ 0x01F1E7, 0x01F1F9, 0,
+ 0x01F1E7, 0x01F1FB, 0,
+ 0x01F1E7, 0x01F1FC, 0,
+ 0x01F1E7, 0x01F1FE, 0,
+ 0x01F1E7, 0x01F1FF, 0,
+ 0x01F1E8, 0x01F1E6, 0,
+ 0x01F1E8, 0x01F1E8, 0,
+ 0x01F1E8, 0x01F1E9, 0,
+ 0x01F1E8, 0x01F1EB, 0,
+ 0x01F1E8, 0x01F1EC, 0,
+ 0x01F1E8, 0x01F1ED, 0,
+ 0x01F1E8, 0x01F1EE, 0,
+ 0x01F1E8, 0x01F1F0, 0,
+ 0x01F1E8, 0x01F1F1, 0,
+ 0x01F1E8, 0x01F1F2, 0,
+ 0x01F1E8, 0x01F1F3, 0,
+ 0x01F1E8, 0x01F1F4, 0,
+ 0x01F1E8, 0x01F1F5, 0,
+ 0x01F1E8, 0x01F1F7, 0,
+ 0x01F1E8, 0x01F1FA, 0,
+ 0x01F1E8, 0x01F1FB, 0,
+ 0x01F1E8, 0x01F1FC, 0,
+ 0x01F1E8, 0x01F1FD, 0,
+ 0x01F1E8, 0x01F1FE, 0,
+ 0x01F1E8, 0x01F1FF, 0,
+ 0x01F1E9, 0x01F1EA, 0,
+ 0x01F1E9, 0x01F1EC, 0,
+ 0x01F1E9, 0x01F1EF, 0,
+ 0x01F1E9, 0x01F1F0, 0,
+ 0x01F1E9, 0x01F1F2, 0,
+ 0x01F1E9, 0x01F1F4, 0,
+ 0x01F1E9, 0x01F1FF, 0,
+ 0x01F1EA, 0x01F1E6, 0,
+ 0x01F1EA, 0x01F1E8, 0,
+ 0x01F1EA, 0x01F1EA, 0,
+ 0x01F1EA, 0x01F1EC, 0,
+ 0x01F1EA, 0x01F1ED, 0,
+ 0x01F1EA, 0x01F1F7, 0,
+ 0x01F1EA, 0x01F1F8, 0,
+ 0x01F1EA, 0x01F1F9, 0,
+ 0x01F1EA, 0x01F1FA, 0,
+ 0x01F1EB, 0x01F1EE, 0,
+ 0x01F1EB, 0x01F1EF, 0,
+ 0x01F1EB, 0x01F1F0, 0,
+ 0x01F1EB, 0x01F1F2, 0,
+ 0x01F1EB, 0x01F1F4, 0,
+ 0x01F1EB, 0x01F1F7, 0,
+ 0x01F1EC, 0x01F1E6, 0,
+ 0x01F1EC, 0x01F1E7, 0,
+ 0x01F1EC, 0x01F1E9, 0,
+ 0x01F1EC, 0x01F1EA, 0,
+ 0x01F1EC, 0x01F1EB, 0,
+ 0x01F1EC, 0x01F1EC, 0,
+ 0x01F1EC, 0x01F1ED, 0,
+ 0x01F1EC, 0x01F1EE, 0,
+ 0x01F1EC, 0x01F1F1, 0,
+ 0x01F1EC, 0x01F1F2, 0,
+ 0x01F1EC, 0x01F1F3, 0,
+ 0x01F1EC, 0x01F1F5, 0,
+ 0x01F1EC, 0x01F1F6, 0,
+ 0x01F1EC, 0x01F1F7, 0,
+ 0x01F1EC, 0x01F1F8, 0,
+ 0x01F1EC, 0x01F1F9, 0,
+ 0x01F1EC, 0x01F1FA, 0,
+ 0x01F1EC, 0x01F1FC, 0,
+ 0x01F1EC, 0x01F1FE, 0,
+ 0x01F1ED, 0x01F1F0, 0,
+ 0x01F1ED, 0x01F1F2, 0,
+ 0x01F1ED, 0x01F1F3, 0,
+ 0x01F1ED, 0x01F1F7, 0,
+ 0x01F1ED, 0x01F1F9, 0,
+ 0x01F1ED, 0x01F1FA, 0,
+ 0x01F1EE, 0x01F1E8, 0,
+ 0x01F1EE, 0x01F1E9, 0,
+ 0x01F1EE, 0x01F1EA, 0,
+ 0x01F1EE, 0x01F1F1, 0,
+ 0x01F1EE, 0x01F1F2, 0,
+ 0x01F1EE, 0x01F1F3, 0,
+ 0x01F1EE, 0x01F1F4, 0,
+ 0x01F1EE, 0x01F1F6, 0,
+ 0x01F1EE, 0x01F1F7, 0,
+ 0x01F1EE, 0x01F1F8, 0,
+ 0x01F1EE, 0x01F1F9, 0,
+ 0x01F1EF, 0x01F1EA, 0,
+ 0x01F1EF, 0x01F1F2, 0,
+ 0x01F1EF, 0x01F1F4, 0,
+ 0x01F1EF, 0x01F1F5, 0,
+ 0x01F1F0, 0x01F1EA, 0,
+ 0x01F1F0, 0x01F1EC, 0,
+ 0x01F1F0, 0x01F1ED, 0,
+ 0x01F1F0, 0x01F1EE, 0,
+ 0x01F1F0, 0x01F1F2, 0,
+ 0x01F1F0, 0x01F1F3, 0,
+ 0x01F1F0, 0x01F1F5, 0,
+ 0x01F1F0, 0x01F1F7, 0,
+ 0x01F1F0, 0x01F1FC, 0,
+ 0x01F1E6, 0x01F1E9, 0,
+ 0x01F1F0, 0x01F1FF, 0,
+ 0x01F1F1, 0x01F1E6, 0,
+ 0x01F1F1, 0x01F1E7, 0,
+ 0x01F1F1, 0x01F1E8, 0,
+ 0x01F1F1, 0x01F1EE, 0,
+ 0x01F1F1, 0x01F1F0, 0,
+ 0x01F1F1, 0x01F1F7, 0,
+ 0x01F1F1, 0x01F1F8, 0,
+ 0x01F1F1, 0x01F1F9, 0,
+ 0x01F1F1, 0x01F1FA, 0,
+ 0x01F1F1, 0x01F1FB, 0,
+ 0x01F1F1, 0x01F1FE, 0,
+ 0x01F1F2, 0x01F1E6, 0,
+ 0x01F1F2, 0x01F1E8, 0,
+ 0x01F1F2, 0x01F1E9, 0,
+ 0x01F1F2, 0x01F1EA, 0,
+ 0x01F1F2, 0x01F1EB, 0,
+ 0x01F1F2, 0x01F1EC, 0,
+ 0x01F1F2, 0x01F1ED, 0,
+ 0x01F1F2, 0x01F1F0, 0,
+ 0x01F1F2, 0x01F1F1, 0,
+ 0x01F1F2, 0x01F1F2, 0,
+ 0x01F1F2, 0x01F1F3, 0,
+ 0x01F1F2, 0x01F1F4, 0,
+ 0x01F1F2, 0x01F1F5, 0,
+ 0x01F1F2, 0x01F1F6, 0,
+ 0x01F1F2, 0x01F1F7, 0,
+ 0x01F1F2, 0x01F1F8, 0,
+ 0x01F1F2, 0x01F1F9, 0,
+ 0x01F1F2, 0x01F1FA, 0,
+ 0x01F1F2, 0x01F1FB, 0,
+ 0x01F1F2, 0x01F1FC, 0,
+ 0x01F1F2, 0x01F1FD, 0,
+ 0x01F1F2, 0x01F1FE, 0,
+ 0x01F1F2, 0x01F1FF, 0,
+ 0x01F1F3, 0x01F1E6, 0,
+ 0x01F1F3, 0x01F1E8, 0,
+ 0x01F1F3, 0x01F1EA, 0,
+ 0x01F1F3, 0x01F1EB, 0,
+ 0x01F1F3, 0x01F1EC, 0,
+ 0x01F1F3, 0x01F1EE, 0,
+ 0x01F1F3, 0x01F1F1, 0,
+ 0x01F1F3, 0x01F1F4, 0,
+ 0x01F1F3, 0x01F1F5, 0,
+ 0x01F1F3, 0x01F1F7, 0,
+ 0x01F1F3, 0x01F1FA, 0,
+ 0x01F1F3, 0x01F1FF, 0,
+ 0x01F1F4, 0x01F1F2, 0,
+ 0x01F1F5, 0x01F1E6, 0,
+ 0x01F1F5, 0x01F1EA, 0,
+ 0x01F1F5, 0x01F1EB, 0,
+ 0x01F1F5, 0x01F1EC, 0,
+ 0x01F1F5, 0x01F1ED, 0,
+ 0x01F1F5, 0x01F1F0, 0,
+ 0x01F1F5, 0x01F1F1, 0,
+ 0x01F1F5, 0x01F1F2, 0,
+ 0x01F1F5, 0x01F1F3, 0,
+ 0x01F1F5, 0x01F1F7, 0,
+ 0x01F1F5, 0x01F1F8, 0,
+ 0x01F1F5, 0x01F1F9, 0,
+ 0x01F1F5, 0x01F1FC, 0,
+ 0x01F1F5, 0x01F1FE, 0,
+ 0x01F1F6, 0x01F1E6, 0,
+ 0x01F1F7, 0x01F1EA, 0,
+ 0x01F1F7, 0x01F1F4, 0,
+ 0x01F1F7, 0x01F1F8, 0,
+ 0x01F1F7, 0x01F1FA, 0,
+ 0x01F1F7, 0x01F1FC, 0,
+ 0x01F1F8, 0x01F1E6, 0,
+ 0x01F1F8, 0x01F1E7, 0,
+ 0x01F1F8, 0x01F1E8, 0,
+ 0x01F1F8, 0x01F1E9, 0,
+ 0x01F1F8, 0x01F1EA, 0,
+ 0x01F1F8, 0x01F1EC, 0,
+ 0x01F1F8, 0x01F1ED, 0,
+ 0x01F1F8, 0x01F1EE, 0,
+ 0x01F1F8, 0x01F1EF, 0,
+ 0x01F1F8, 0x01F1F0, 0,
+ 0x01F1F8, 0x01F1F1, 0,
+ 0x01F1F8, 0x01F1F2, 0,
+ 0x01F1F8, 0x01F1F3, 0,
+ 0x01F1F8, 0x01F1F4, 0,
+ 0x01F1F8, 0x01F1F7, 0,
+ 0x01F1F8, 0x01F1F8, 0,
+ 0x01F1F8, 0x01F1F9, 0,
+ 0x01F1F8, 0x01F1FB, 0,
+ 0x01F1F8, 0x01F1FD, 0,
+ 0x01F1F8, 0x01F1FE, 0,
+ 0x01F1F8, 0x01F1FF, 0,
+ 0x01F1F9, 0x01F1E6, 0,
+ 0x01F1F9, 0x01F1E8, 0,
+ 0x01F1F9, 0x01F1E9, 0,
+ 0x01F1F9, 0x01F1EB, 0,
+ 0x01F1F9, 0x01F1EC, 0,
+ 0x01F1F9, 0x01F1ED, 0,
+ 0x01F1F9, 0x01F1EF, 0,
+ 0x01F1F9, 0x01F1F0, 0,
+ 0x01F1F9, 0x01F1F1, 0,
+ 0x01F1F9, 0x01F1F2, 0,
+ 0x01F1F9, 0x01F1F3, 0,
+ 0x01F1F9, 0x01F1F4, 0,
+ 0x01F1F9, 0x01F1F7, 0,
+ 0x01F1F9, 0x01F1F9, 0,
+ 0x01F1F9, 0x01F1FB, 0,
+ 0x01F1F9, 0x01F1FC, 0,
+ 0x01F1F9, 0x01F1FF, 0,
+ 0x01F1FA, 0x01F1E6, 0,
+ 0x01F1FA, 0x01F1EC, 0,
+ 0x01F1FA, 0x01F1F2, 0,
+ 0x01F1FA, 0x01F1F3, 0,
+ 0x01F1FA, 0x01F1F8, 0,
+ 0x01F1FA, 0x01F1FE, 0,
+ 0x01F1FA, 0x01F1FF, 0,
+ 0x01F1FB, 0x01F1E6, 0,
+ 0x01F1FB, 0x01F1E8, 0,
+ 0x01F1FB, 0x01F1EA, 0,
+ 0x01F1FB, 0x01F1EC, 0,
+ 0x01F1FB, 0x01F1EE, 0,
+ 0x01F1FB, 0x01F1F3, 0,
+ 0x01F1FB, 0x01F1FA, 0,
+ 0x01F1FC, 0x01F1EB, 0,
+ 0x01F1FC, 0x01F1F8, 0,
+ 0x01F1FD, 0x01F1F0, 0,
+ 0x01F1FE, 0x01F1EA, 0,
+ 0x01F1FE, 0x01F1F9, 0,
+ 0x01F1FF, 0x01F1E6, 0,
+ 0x01F1FF, 0x01F1F2, 0,
+ 0x01F1F0, 0x01F1FE, 0,
+ 0 // null-terminating the list
+};
+
+const base::uc32 UnicodePropertySequences::kEmojiTagSequences[] = {  // Rows: one sequence's code points, then 0.
+ 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0065, 0x0E006E, 0x0E0067, 0x0E007F, 0,
+ 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0073, 0x0E0063, 0x0E0074, 0x0E007F, 0,
+ 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0077, 0x0E006C, 0x0E0073, 0x0E007F, 0,
+ 0 // null-terminating the list
+};
+
+const base::uc32 UnicodePropertySequences::kEmojiZWJSequences[] = {
+ 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0,
+ 0x01F441, 0x00FE0F, 0x00200D, 0x01F5E8, 0x00FE0F, 0,
+ 0x01F468, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F467, 0,
+ 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0,
+ 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0,
+ 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0,
+ 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0,
+ 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0,
+ 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0,
+ 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0,
+ 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F469, 0,
+ 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D,
+ 0x01F468, 0,
+ 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D,
+ 0x01F469, 0,
+ 0x01F469, 0x00200D, 0x01F466, 0,
+ 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0,
+ 0x01F469, 0x00200D, 0x01F467, 0,
+ 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0,
+ 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0,
+ 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0,
+ 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0,
+ 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0,
+ 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0,
+ 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FD, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FD, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FE, 0,
+ 0x01F9D1, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0,
+ 0x01F9D1, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0,
+ 0x01F9D1, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0,
+ 0x01F9D1, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0,
+ 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0,
+ 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0,
+ 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0,
+ 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0,
+ 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0,
+ 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0,
+ 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FE, 0,
+ 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0,
+ 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0,
+ 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0,
+ 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FE, 0,
+ 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FF, 0,
+ 0x01F468, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F468, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F468, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F468, 0x00200D, 0x01F33E, 0,
+ 0x01F468, 0x00200D, 0x01F373, 0,
+ 0x01F468, 0x00200D, 0x01F393, 0,
+ 0x01F468, 0x00200D, 0x01F3A4, 0,
+ 0x01F468, 0x00200D, 0x01F3A8, 0,
+ 0x01F468, 0x00200D, 0x01F3EB, 0,
+ 0x01F468, 0x00200D, 0x01F3ED, 0,
+ 0x01F468, 0x00200D, 0x01F4BB, 0,
+ 0x01F468, 0x00200D, 0x01F4BC, 0,
+ 0x01F468, 0x00200D, 0x01F527, 0,
+ 0x01F468, 0x00200D, 0x01F52C, 0,
+ 0x01F468, 0x00200D, 0x01F680, 0,
+ 0x01F468, 0x00200D, 0x01F692, 0,
+ 0x01F468, 0x00200D, 0x01F9AF, 0,
+ 0x01F468, 0x00200D, 0x01F9BC, 0,
+ 0x01F468, 0x00200D, 0x01F9BD, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F33E, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F373, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F393, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F3A4, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F3A8, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F3EB, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F3ED, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F4BB, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F4BC, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F527, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F52C, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F680, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F692, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9AF, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9BC, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9BD, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F33E, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F373, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F393, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F3A4, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F3A8, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F3EB, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F3ED, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F4BB, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F4BC, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F527, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F52C, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F680, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F692, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9AF, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9BC, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9BD, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F33E, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F373, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F393, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F3A4, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F3A8, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F3EB, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F3ED, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F4BB, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F4BC, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F527, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F52C, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F680, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F692, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9AF, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9BC, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9BD, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F33E, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F373, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F393, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F3A4, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F3A8, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F3EB, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F3ED, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F4BB, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F4BC, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F527, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F52C, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F680, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F692, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9AF, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9BC, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9BD, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F33E, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F373, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F393, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F3A4, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F3A8, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F3EB, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F3ED, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F4BB, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F4BC, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F527, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F52C, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F680, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F692, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9AF, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9BC, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9BD, 0,
+ 0x01F469, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F469, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F469, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F469, 0x00200D, 0x01F33E, 0,
+ 0x01F469, 0x00200D, 0x01F373, 0,
+ 0x01F469, 0x00200D, 0x01F393, 0,
+ 0x01F469, 0x00200D, 0x01F3A4, 0,
+ 0x01F469, 0x00200D, 0x01F3A8, 0,
+ 0x01F469, 0x00200D, 0x01F3EB, 0,
+ 0x01F469, 0x00200D, 0x01F3ED, 0,
+ 0x01F469, 0x00200D, 0x01F4BB, 0,
+ 0x01F469, 0x00200D, 0x01F4BC, 0,
+ 0x01F469, 0x00200D, 0x01F527, 0,
+ 0x01F469, 0x00200D, 0x01F52C, 0,
+ 0x01F469, 0x00200D, 0x01F680, 0,
+ 0x01F469, 0x00200D, 0x01F692, 0,
+ 0x01F469, 0x00200D, 0x01F9AF, 0,
+ 0x01F469, 0x00200D, 0x01F9BC, 0,
+ 0x01F469, 0x00200D, 0x01F9BD, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F33E, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F373, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F393, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F3A4, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F3A8, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F3EB, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F3ED, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F4BB, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F4BC, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F527, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F52C, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F680, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F692, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9AF, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9BC, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9BD, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F33E, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F373, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F393, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F3A4, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F3A8, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F3EB, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F3ED, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F4BB, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F4BC, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F527, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F52C, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F680, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F692, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9AF, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9BC, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9BD, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F33E, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F373, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F393, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F3A4, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F3A8, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F3EB, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F3ED, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F4BB, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F4BC, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F527, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F52C, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F680, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F692, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9AF, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9BC, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9BD, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F33E, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F373, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F393, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F3A4, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F3A8, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F3EB, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F3ED, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F4BB, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F4BC, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F527, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F52C, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F680, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F692, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9AF, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9BC, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9BD, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x002695, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x002696, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x002708, 0x00FE0F, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F33E, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F373, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F393, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F3A4, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F3A8, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F3EB, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F3ED, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F4BB, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F4BC, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F527, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F52C, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F680, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F692, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9AF, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9BC, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9BD, 0,
+ 0x0026F9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x0026F9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x0026F9, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x0026F9, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C3, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C3, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C3, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C4, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C4, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3C4, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CA, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CA, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CA, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CB, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CB, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CB, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CC, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F3CC, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F3CC, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46E, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46E, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F46F, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F46F, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F471, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F471, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F471, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F473, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F473, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F473, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F477, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F477, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F477, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F481, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F481, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F481, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F482, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F482, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D,
+ 0x01F468, 0,
+ 0x01F482, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F482, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F486, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F486, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F486, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F487, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F487, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F487, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F575, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F575, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F575, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F645, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F645, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F645, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F646, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F646, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F646, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F647, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F647, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F647, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64B, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64B, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64B, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64D, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64D, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64D, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64E, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64E, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F64E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6A3, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6A3, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6A3, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B4, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B4, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B4, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B5, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B5, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B5, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B6, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B6, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F6B6, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F926, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F926, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F926, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F937, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F937, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F937, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F938, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F938, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F938, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F939, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F939, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F939, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93C, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93C, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93D, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93D, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93D, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93E, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93E, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F93E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B8, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B8, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B8, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B9, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B9, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9B9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CD, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CE, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9CF, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D6, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D6, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D6, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D7, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D7, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D7, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D8, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D8, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D8, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D9, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D9, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9D9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DA, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DA, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DA, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DB, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DC, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DD, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DE, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DE, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F9DF, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0x01F9DF, 0x00200D, 0x002642, 0x00FE0F, 0,
+ 0x01F468, 0x00200D, 0x01F9B0, 0,
+ 0x01F468, 0x00200D, 0x01F9B1, 0,
+ 0x01F468, 0x00200D, 0x01F9B2, 0,
+ 0x01F468, 0x00200D, 0x01F9B3, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B0, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B1, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B2, 0,
+ 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B3, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B0, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B1, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B2, 0,
+ 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B3, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B0, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B1, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B2, 0,
+ 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B3, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B0, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B1, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B2, 0,
+ 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B3, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B0, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B1, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B2, 0,
+ 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B3, 0,
+ 0x01F469, 0x00200D, 0x01F9B0, 0,
+ 0x01F469, 0x00200D, 0x01F9B1, 0,
+ 0x01F469, 0x00200D, 0x01F9B2, 0,
+ 0x01F469, 0x00200D, 0x01F9B3, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B0, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B1, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B2, 0,
+ 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B3, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B0, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B1, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B2, 0,
+ 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B3, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B0, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B1, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B2, 0,
+ 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B3, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B0, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B1, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B2, 0,
+ 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B3, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B0, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B1, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B2, 0,
+ 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B3, 0,
+ 0x01F3F3, 0x00FE0F, 0x00200D, 0x01F308, 0,
+ 0x01F3F4, 0x00200D, 0x002620, 0x00FE0F, 0,
+ 0x01F415, 0x00200D, 0x01F9BA, 0,
+ 0x01F482, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0,
+ 0 // null-terminating the list
+};
+// clang-format on
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_INTL_SUPPORT
diff --git a/js/src/irregexp/imported/property-sequences.h b/js/src/irregexp/imported/property-sequences.h
new file mode 100644
index 0000000000..9b3a188865
--- /dev/null
+++ b/js/src/irregexp/imported/property-sequences.h
@@ -0,0 +1,27 @@
+// Copyright 2018 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_PROPERTY_SEQUENCES_H_
+#define V8_REGEXP_PROPERTY_SEQUENCES_H_
+
+#ifdef V8_INTL_SUPPORT
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+// Static-only holder (derives from AllStatic) that declares the emoji
+// sequence tables; the arrays themselves are defined in
+// property-sequences.cc.
+// NOTE(review): the visible tail of kEmojiZWJSequences shows every sequence
+// ending in 0 and the whole list ending in one more 0; presumably the other
+// two tables use the same layout - confirm.
+class UnicodePropertySequences : public AllStatic {
+ public:
+ static const base::uc32 kEmojiFlagSequences[];
+ static const base::uc32 kEmojiTagSequences[];
+ static const base::uc32 kEmojiZWJSequences[];
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_INTL_SUPPORT
+
+#endif // V8_REGEXP_PROPERTY_SEQUENCES_H_
diff --git a/js/src/irregexp/imported/regexp-ast.cc b/js/src/irregexp/imported/regexp-ast.cc
new file mode 100644
index 0000000000..63eeb5c05d
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-ast.cc
@@ -0,0 +1,432 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-ast.h"
+
+
+namespace v8 {
+namespace internal {
+
+// Generates RegExp<Name>::Accept for every tree type. Each definition forwards
+// to the visitor's matching Visit<Name> method, passing |this| and |data|,
+// and returns whatever the visitor returns.
+#define MAKE_ACCEPT(Name) \
+ void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
+ return visitor->Visit##Name(this, data); \
+ }
+FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
+#undef MAKE_ACCEPT
+
+// Base-class defaults for the As<Name>/Is<Name> queries: a plain RegExpTree is
+// none of the concrete node types, so these return nullptr and false.
+#define MAKE_TYPE_CASE(Name) \
+ RegExp##Name* RegExpTree::As##Name() { return nullptr; } \
+ bool RegExpTree::Is##Name() { return false; }
+FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
+#undef MAKE_TYPE_CASE
+
+// Per-type overrides of the As<Name>/Is<Name> queries: each concrete
+// RegExp<Name> answers its own query with |this| and true.
+#define MAKE_TYPE_CASE(Name) \
+ RegExp##Name* RegExp##Name::As##Name() { return this; } \
+ bool RegExp##Name::Is##Name() { return true; }
+FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
+#undef MAKE_TYPE_CASE
+
+namespace {
+
+// Returns the union of the capture-register intervals of every tree in
+// |children| (the empty interval when the list has no elements).
+Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
+  Interval registers = Interval::Empty();
+  for (int index = 0; index < children->length(); index++) {
+    RegExpTree* child = children->at(index);
+    registers = registers.Union(child->CaptureRegisters());
+  }
+  return registers;
+}
+
+} // namespace
+
+// An alternative uses the capture registers used by any of its nodes.
+Interval RegExpAlternative::CaptureRegisters() {
+  ZoneList<RegExpTree*>* children = nodes();
+  return ListCaptureRegisters(children);
+}
+
+
+// A disjunction uses the capture registers used by any of its alternatives.
+Interval RegExpDisjunction::CaptureRegisters() {
+  ZoneList<RegExpTree*>* branches = alternatives();
+  return ListCaptureRegisters(branches);
+}
+
+
+// A lookaround adds no registers of its own; it reports those of its body.
+Interval RegExpLookaround::CaptureRegisters() {
+  RegExpTree* inner = body();
+  return inner->CaptureRegisters();
+}
+
+
+// A capture uses its own start/end register pair plus whatever registers the
+// captured body uses.
+Interval RegExpCapture::CaptureRegisters() {
+  const int capture_index = index();
+  Interval own_registers(StartRegister(capture_index),
+                         EndRegister(capture_index));
+  return own_registers.Union(body()->CaptureRegisters());
+}
+
+
+// A quantifier adds no registers of its own; it reports those of its body.
+Interval RegExpQuantifier::CaptureRegisters() {
+  RegExpTree* repeated = body();
+  return repeated->CaptureRegisters();
+}
+
+
+// Only the START_OF_INPUT assertion anchors a match at the start.
+bool RegExpAssertion::IsAnchoredAtStart() {
+  const RegExpAssertion::Type kind = assertion_type();
+  return kind == RegExpAssertion::Type::START_OF_INPUT;
+}
+
+
+// Only the END_OF_INPUT assertion anchors a match at the end.
+bool RegExpAssertion::IsAnchoredAtEnd() {
+  const RegExpAssertion::Type kind = assertion_type();
+  return kind == RegExpAssertion::Type::END_OF_INPUT;
+}
+
+
+// Walks the nodes front to back. The alternative is start-anchored if a
+// start-anchored node is reached before any node that may consume input
+// (max_match() > 0).
+bool RegExpAlternative::IsAnchoredAtStart() {
+  ZoneList<RegExpTree*>* children = this->nodes();
+  for (int index = 0; index < children->length(); index++) {
+    RegExpTree* child = children->at(index);
+    if (child->IsAnchoredAtStart()) return true;
+    // Once a node can consume input, later anchors no longer count.
+    if (child->max_match() > 0) break;
+  }
+  return false;
+}
+
+
+// Walks the nodes back to front. The alternative is end-anchored if an
+// end-anchored node is reached before any node that may consume input
+// (max_match() > 0).
+bool RegExpAlternative::IsAnchoredAtEnd() {
+  ZoneList<RegExpTree*>* children = this->nodes();
+  int index = children->length() - 1;
+  while (index >= 0) {
+    RegExpTree* child = children->at(index);
+    if (child->IsAnchoredAtEnd()) return true;
+    // Once a node can consume input, earlier anchors no longer count.
+    if (child->max_match() > 0) break;
+    index--;
+  }
+  return false;
+}
+
+
+// A disjunction is start-anchored only if every alternative is.
+bool RegExpDisjunction::IsAnchoredAtStart() {
+  ZoneList<RegExpTree*>* branches = this->alternatives();
+  for (int index = 0; index < branches->length(); index++) {
+    RegExpTree* branch = branches->at(index);
+    if (branch->IsAnchoredAtStart()) continue;
+    return false;
+  }
+  return true;
+}
+
+
+// A disjunction is end-anchored only if every alternative is.
+bool RegExpDisjunction::IsAnchoredAtEnd() {
+  ZoneList<RegExpTree*>* branches = this->alternatives();
+  for (int index = 0; index < branches->length(); index++) {
+    RegExpTree* branch = branches->at(index);
+    if (branch->IsAnchoredAtEnd()) continue;
+    return false;
+  }
+  return true;
+}
+
+
+// Only a positive lookahead can pass on the start anchoring of its body.
+bool RegExpLookaround::IsAnchoredAtStart() {
+  if (!is_positive()) return false;
+  if (type() != LOOKAHEAD) return false;
+  return body()->IsAnchoredAtStart();
+}
+
+
+// A capture group is start-anchored exactly when its body is.
+bool RegExpCapture::IsAnchoredAtStart() {
+  RegExpTree* inner = body();
+  return inner->IsAnchoredAtStart();
+}
+
+
+// A capture group is end-anchored exactly when its body is.
+bool RegExpCapture::IsAnchoredAtEnd() {
+  RegExpTree* inner = body();
+  return inner->IsAnchoredAtEnd();
+}
+
+namespace {
+
+// Converts regular expression trees to a simple s-expression representation
+// written to an output stream.
+// The representation deliberately differs from the input grammar in as many
+// cases as possible: if the input and output formats were alike, an incorrect
+// parse could easily be mistaken for a correct one.
+class RegExpUnparser final : public RegExpVisitor {
+ public:
+ // |os| receives the output; |zone| is passed to RegExpClassRanges::ranges()
+ // when printing class ranges.
+ RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
+ // Prints a single range as "from" or "from-to".
+ void VisitCharacterRange(CharacterRange that);
+ // One Visit<Name> override per tree type listed in
+ // FOR_EACH_REG_EXP_TREE_TYPE.
+#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
+ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
+#undef MAKE_CASE
+ private:
+ std::ostream& os_;
+ Zone* zone_;
+};
+
+} // namespace
+
+// Prints a disjunction as "(|" followed by each alternative (space separated)
+// and a closing ")".
+void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
+  ZoneList<RegExpTree*>* branches = that->alternatives();
+  os_ << "(|";
+  for (int index = 0; index < branches->length(); index++) {
+    os_ << " ";
+    RegExpTree* branch = branches->at(index);
+    branch->Accept(this, data);
+  }
+  os_ << ")";
+  return nullptr;
+}
+
+
+// Prints an alternative as "(:" followed by each node (space separated) and a
+// closing ")".
+void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
+  ZoneList<RegExpTree*>* children = that->nodes();
+  os_ << "(:";
+  for (int index = 0; index < children->length(); index++) {
+    os_ << " ";
+    RegExpTree* child = children->at(index);
+    child->Accept(this, data);
+  }
+  os_ << ")";
+  return nullptr;
+}
+
+
+// Prints a range as "from" when it holds a single code point and as
+// "from-to" otherwise.
+void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
+  os_ << AsUC32(that.from());
+  if (that.IsSingleton()) return;
+  os_ << "-" << AsUC32(that.to());
+}
+
+// Prints a character class as "[" <ranges> "]", with the ranges separated by
+// single spaces and a leading "^" when the class is negated.
+void* RegExpUnparser::VisitClassRanges(RegExpClassRanges* that, void* data) {
+  if (that->is_negated()) {
+    os_ << "^";
+  }
+  os_ << "[";
+  for (int index = 0; index < that->ranges(zone_)->length(); index++) {
+    if (index != 0) {
+      os_ << " ";
+    }
+    VisitCharacterRange(that->ranges(zone_)->at(index));
+  }
+  os_ << "]";
+  return nullptr;
+}
+
+// Prints a class set operand as "![" <ranges> <strings> "]". Ranges are
+// separated by single spaces; every class string follows as " '<string>'".
+// Always returns nullptr.
+void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that,
+                                           void* data) {
+  os_ << "![";
+  for (int i = 0; i < that->ranges()->length(); i++) {
+    if (i > 0) os_ << " ";
+    VisitCharacterRange(that->ranges()->at(i));
+  }
+  if (that->has_strings()) {
+    // Bind by reference; there is no need to copy each map entry.
+    for (const auto& entry : *that->strings()) {
+      os_ << " '";
+      // The string is a sequence of uc32 code points. Print them one by one
+      // through AsUC32 (as VisitCharacterRange does) instead of narrowing
+      // each code point to a char, which garbles anything that does not fit.
+      for (base::uc32 code_point : entry.first) {
+        os_ << AsUC32(code_point);
+      }
+      os_ << "'";
+    }
+  }
+  os_ << "]";
+  return nullptr;
+}
+
+// Prints a class set expression as <operator> ["^"] "[" <operands> "]", where
+// the operator is "++" (union), "&&" (intersection) or "--" (subtraction) and
+// the operands are separated by single spaces.
+void* RegExpUnparser::VisitClassSetExpression(RegExpClassSetExpression* that,
+                                              void* data) {
+  const char* operator_token = nullptr;
+  switch (that->operation()) {
+    case RegExpClassSetExpression::OperationType::kUnion:
+      operator_token = "++";
+      break;
+    case RegExpClassSetExpression::OperationType::kIntersection:
+      operator_token = "&&";
+      break;
+    case RegExpClassSetExpression::OperationType::kSubtraction:
+      operator_token = "--";
+      break;
+  }
+  if (operator_token != nullptr) {
+    os_ << operator_token;
+  }
+  if (that->is_negated()) {
+    os_ << "^";
+  }
+  os_ << "[";
+  for (int index = 0; index < that->operands()->length(); index++) {
+    if (index != 0) {
+      os_ << " ";
+    }
+    that->operands()->at(index)->Accept(this, data);
+  }
+  os_ << "]";
+  return nullptr;
+}
+
+// Prints an assertion as a short "@..." marker: "@^i"/"@$i" for start/end of
+// input, "@^l"/"@$l" for start/end of line, "@b"/"@B" for (non-)boundary.
+void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
+  const char* marker = nullptr;
+  switch (that->assertion_type()) {
+    case RegExpAssertion::Type::START_OF_INPUT:
+      marker = "@^i";
+      break;
+    case RegExpAssertion::Type::END_OF_INPUT:
+      marker = "@$i";
+      break;
+    case RegExpAssertion::Type::START_OF_LINE:
+      marker = "@^l";
+      break;
+    case RegExpAssertion::Type::END_OF_LINE:
+      marker = "@$l";
+      break;
+    case RegExpAssertion::Type::BOUNDARY:
+      marker = "@b";
+      break;
+    case RegExpAssertion::Type::NON_BOUNDARY:
+      marker = "@B";
+      break;
+  }
+  if (marker != nullptr) {
+    os_ << marker;
+  }
+  return nullptr;
+}
+
+
+// Prints an atom as its code units (via AsUC16) wrapped in single quotes.
+void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
+  base::Vector<const base::uc16> code_units = that->data();
+  os_ << "'";
+  for (int index = 0; index < code_units.length(); index++) {
+    const base::uc16 unit = code_units[index];
+    os_ << AsUC16(unit);
+  }
+  os_ << "'";
+  return nullptr;
+}
+
+
+// Prints a text node. A single element is printed bare; otherwise the
+// elements are printed as "(!" followed by each element (space separated)
+// and a closing ")".
+void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
+  ZoneList<TextElement>* parts = that->elements();
+  if (parts->length() == 1) {
+    parts->at(0).tree()->Accept(this, data);
+    return nullptr;
+  }
+  os_ << "(!";
+  for (int index = 0; index < parts->length(); index++) {
+    os_ << " ";
+    parts->at(index).tree()->Accept(this, data);
+  }
+  os_ << ")";
+  return nullptr;
+}
+
+
+// Prints a quantifier as "(# <min> <max> <mode> <body>)". An unbounded max
+// (kInfinity) is printed as "-"; the mode is "g" (greedy), "p" (possessive)
+// or "n" (anything else).
+void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
+  os_ << "(# " << that->min() << " ";
+  if (that->max() != RegExpTree::kInfinity) {
+    os_ << that->max() << " ";
+  } else {
+    os_ << "- ";
+  }
+  const char* mode = "n ";
+  if (that->is_greedy()) {
+    mode = "g ";
+  } else if (that->is_possessive()) {
+    mode = "p ";
+  }
+  os_ << mode;
+  that->body()->Accept(this, data);
+  os_ << ")";
+  return nullptr;
+}
+
+
+// Prints a capture group as "(^ <body>)".
+void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
+  RegExpTree* inner = that->body();
+  os_ << "(^ ";
+  inner->Accept(this, data);
+  os_ << ")";
+  return nullptr;
+}
+
+// Prints a non-capturing group as "(?: <body>)".
+void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) {
+  RegExpTree* inner = that->body();
+  os_ << "(?: ";
+  inner->Accept(this, data);
+  os_ << ")";
+  return nullptr;
+}
+
+// Prints a lookaround as "(<dir> <sign> <body>)": the direction is "->" for a
+// lookahead and "<-" otherwise; the sign is "+" when positive, "-" when
+// negative.
+void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
+  const bool is_lookahead = that->type() == RegExpLookaround::LOOKAHEAD;
+  const char* direction = is_lookahead ? "->" : "<-";
+  const char* polarity = that->is_positive() ? " + " : " - ";
+  os_ << "(" << direction << polarity;
+  that->body()->Accept(this, data);
+  os_ << ")";
+  return nullptr;
+}
+
+
+// Prints a back reference as "(<- <index>)".
+void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
+                                         void* data) {
+  os_ << "(<- ";
+  os_ << that->index();
+  os_ << ")";
+  return nullptr;
+}
+
+
+// Prints the empty expression as a single '%'.
+void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
+  constexpr char kEmptyMarker = '%';
+  os_ << kEmptyMarker;
+  return nullptr;
+}
+
+// Writes the s-expression form of this tree to |os| by running a
+// RegExpUnparser over it, and returns |os| so calls can be chained.
+std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) {
+  RegExpUnparser printer(os, zone);
+  this->Accept(&printer, nullptr);
+  return os;
+}
+
+// Stores |alternatives| and derives the match-length bounds: the minimum is
+// the smallest min_match() of any alternative, the maximum the largest
+// max_match(). Debug builds check that there are at least two alternatives.
+RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
+    : alternatives_(alternatives) {
+  DCHECK_LT(1, alternatives->length());
+  RegExpTree* first = alternatives->at(0);
+  int shortest = first->min_match();
+  int longest = first->max_match();
+  for (int index = 1; index < alternatives->length(); index++) {
+    RegExpTree* branch = alternatives->at(index);
+    shortest = std::min(shortest, branch->min_match());
+    longest = std::max(longest, branch->max_match());
+  }
+  min_match_ = shortest;
+  max_match_ = longest;
+}
+
+namespace {
+
+// Returns previous + increase, saturating at RegExpTree::kInfinity when the
+// sum would exceed it.
+int IncreaseBy(int previous, int increase) {
+  const int headroom = RegExpTree::kInfinity - previous;
+  return (headroom < increase) ? RegExpTree::kInfinity : previous + increase;
+}
+
+} // namespace
+
+// Stores |nodes| and derives the match-length bounds as the (saturating) sums
+// of the nodes' min_match() and max_match() values. Debug builds check that
+// there are at least two nodes.
+RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
+    : nodes_(nodes) {
+  DCHECK_LT(1, nodes->length());
+  int total_min = 0;
+  int total_max = 0;
+  for (int index = 0; index < nodes->length(); index++) {
+    RegExpTree* child = nodes->at(index);
+    total_min = IncreaseBy(total_min, child->min_match());
+    total_max = IncreaseBy(total_max, child->max_match());
+  }
+  min_match_ = total_min;
+  max_match_ = total_max;
+}
+
+// Stores |ranges| and |strings| and derives the match-length bounds.
+// Non-empty ranges contribute the bounds [1, 2]; each class string then
+// lowers the minimum / raises the maximum with its own bounds. With empty
+// ranges the minimum starts at 0 and therefore stays 0.
+RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
+                                             CharacterClassStrings* strings)
+    : ranges_(ranges), strings_(strings) {
+  DCHECK_NOT_NULL(ranges);
+  const bool has_ranges = !ranges->is_empty();
+  min_match_ = has_ranges ? 1 : 0;
+  max_match_ = has_ranges ? 2 : 0;
+  if (!has_strings()) return;
+  for (const auto& entry : *strings) {
+    RegExpTree* string_tree = entry.second;
+    min_match_ = std::min(min_match_, string_tree->min_match());
+    max_match_ = std::max(max_match_, string_tree->max_match());
+  }
+}
+
+// Builds a class set expression applying |op| to |operands|.
+// Debug builds check that |operands| is non-null and that a negated
+// expression is not flagged as possibly containing strings. max_match_ is
+// the largest max_match() of any operand (0 when there are none).
+RegExpClassSetExpression::RegExpClassSetExpression(
+    OperationType op, bool is_negated, bool may_contain_strings,
+    ZoneList<RegExpTree*>* operands)
+    : operation_(op),
+      is_negated_(is_negated),
+      may_contain_strings_(may_contain_strings),
+      operands_(operands) {
+  DCHECK_NOT_NULL(operands);
+  DCHECK_IMPLIES(is_negated_, !may_contain_strings_);
+  max_match_ = 0;
+  // The loop variable is named |operand| so that it does not shadow the
+  // constructor parameter |op|.
+  for (RegExpTree* operand : *operands) {
+    max_match_ = std::max(max_match_, operand->max_match());
+  }
+}
+
+// static
+// Allocates, in |zone|, a union expression whose only operand has no ranges
+// and no strings; |is_negated| is forwarded to the expression.
+RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone,
+                                                          bool is_negated) {
+  ZoneList<CharacterRange>* no_ranges =
+      zone->template New<ZoneList<CharacterRange>>(0, zone);
+  RegExpClassSetOperand* empty_operand =
+      zone->template New<RegExpClassSetOperand>(no_ranges, nullptr);
+  ZoneList<RegExpTree*>* operand_list =
+      zone->template New<ZoneList<RegExpTree*>>(1, zone);
+  operand_list->Add(empty_operand, zone);
+  constexpr bool kMayContainStrings = false;
+  return zone->template New<RegExpClassSetExpression>(
+      RegExpClassSetExpression::OperationType::kUnion, is_negated,
+      kMayContainStrings, operand_list);
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-ast.h b/js/src/irregexp/imported/regexp-ast.h
new file mode 100644
index 0000000000..997282e519
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-ast.h
@@ -0,0 +1,735 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_AST_H_
+#define V8_REGEXP_REGEXP_AST_H_
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uniset.h"
+#endif // V8_INTL_SUPPORT
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+// X-macro listing every concrete regexp AST node type. VISIT(Name) is expanded
+// once per entry; users in this file generate forward declarations, visitor
+// methods and the As<Name>/Is<Name> helpers from it.
+#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
+ VISIT(Disjunction) \
+ VISIT(Alternative) \
+ VISIT(Assertion) \
+ VISIT(ClassRanges) \
+ VISIT(ClassSetOperand) \
+ VISIT(ClassSetExpression) \
+ VISIT(Atom) \
+ VISIT(Quantifier) \
+ VISIT(Capture) \
+ VISIT(Group) \
+ VISIT(Lookaround) \
+ VISIT(BackReference) \
+ VISIT(Empty) \
+ VISIT(Text)
+
+// Forward-declares class RegExp<Name> for every tree type in the list.
+#define FORWARD_DECLARE(Name) class RegExp##Name;
+FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
+#undef FORWARD_DECLARE
+
+class RegExpCompiler;
+class RegExpNode;
+class RegExpTree;
+
+// Abstract visitor over the regexp AST. It declares one pure virtual
+// Visit<Name> method per entry of FOR_EACH_REG_EXP_TREE_TYPE; |data| is an
+// opaque pointer that RegExpTree::Accept hands through unchanged.
+class RegExpVisitor {
+ public:
+ virtual ~RegExpVisitor() = default;
+#define MAKE_CASE(Name) \
+ virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
+ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
+#undef MAKE_CASE
+};
+
+// A closed integer interval [from, to]. The empty interval is encoded by
+// from == kNone.
+class Interval {
+ public:
+  static constexpr int kNone = -1;
+
+  // Creates the empty interval. to_ is kNone - 1 so that size() evaluates to
+  // 0 without needing a branch.
+  Interval() : from_(kNone), to_(kNone - 1) {}
+  Interval(int from, int to) : from_(from), to_(to) {}
+
+  static Interval Empty() { return Interval(); }
+
+  // Returns the smallest interval covering both operands. An empty operand
+  // yields the other operand unchanged.
+  Interval Union(Interval that) {
+    if (that.from_ == kNone) return *this;
+    if (from_ == kNone) return that;
+    const int lower = std::min(from_, that.from_);
+    const int upper = std::max(to_, that.to_);
+    return Interval(lower, upper);
+  }
+
+  bool Contains(int value) const { return from_ <= value && value <= to_; }
+  bool is_empty() const { return from_ == kNone; }
+  int from() const { return from_; }
+  int to() const { return to_; }
+  // Number of integers in the interval (both ends are inclusive).
+  int size() const { return to_ - from_ + 1; }
+
+ private:
+  int from_;
+  int to_;
+};
+
+// Named standard character sets. Each enumerator's underlying char value is a
+// one-character tag for the set (mostly the escape letter it corresponds to).
+enum class StandardCharacterSet : char {
+ kWhitespace = 's', // Like /\s/.
+ kNotWhitespace = 'S', // Like /\S/.
+ kWord = 'w', // Like /\w/.
+ kNotWord = 'W', // Like /\W/.
+ kDigit = 'd', // Like /\d/.
+ kNotDigit = 'D', // Like /\D/.
+ kLineTerminator = 'n', // The inverse of /./.
+ kNotLineTerminator = '.', // Like /./.
+ kEverything = '*', // Matches every character, like /./s.
+};
+
+// Represents code points (with values up to 0x10FFFF) in the range from from_
+// to to_, both ends are inclusive.
+class CharacterRange {
+ public:
+ // Default-constructs the range [0, 0] (see the member initializers below).
+ CharacterRange() = default;
+ // For compatibility with the CHECK_OK macro.
+ CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
+
+ // Returns the range that contains exactly |value|.
+ static inline CharacterRange Singleton(base::uc32 value) {
+ return CharacterRange(value, value);
+ }
+ // Returns the range [from, to]. Debug builds check that |to| does not
+ // exceed kMaxCodePoint and that from <= to.
+ static inline CharacterRange Range(base::uc32 from, base::uc32 to) {
+ DCHECK(0 <= from && to <= kMaxCodePoint);
+ DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
+ return CharacterRange(from, to);
+ }
+ // Returns the range covering every code point, [0, kMaxCodePoint].
+ static inline CharacterRange Everything() {
+ return CharacterRange(0, kMaxCodePoint);
+ }
+
+ // Allocates a new list in |zone| that holds just |range|.
+ static inline ZoneList<CharacterRange>* List(Zone* zone,
+ CharacterRange range) {
+ ZoneList<CharacterRange>* list =
+ zone->New<ZoneList<CharacterRange>>(1, zone);
+ list->Add(range, zone);
+ return list;
+ }
+
+ // Add class escapes. Add case equivalent closure for \w and \W if necessary.
+ V8_EXPORT_PRIVATE static void AddClassEscape(
+ StandardCharacterSet standard_character_set,
+ ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
+ Zone* zone);
+ // Add case equivalents to ranges. Only used for /i, not for /ui or /vi, as
+ // the semantics for unicode mode are slightly different.
+ // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4.
+ V8_EXPORT_PRIVATE static void AddCaseEquivalents(
+ Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
+ bool is_one_byte);
+ // Add case equivalent code points to ranges. Only used for /ui and /vi, not
+ // for /i, as the semantics for non-unicode mode are slightly different.
+ // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4.
+ static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
+ Zone* zone);
+
+#ifdef V8_INTL_SUPPORT
+ // Creates the closeOver of the given UnicodeSet, removing all
+ // characters/strings that can't be derived via simple case folding.
+ static void UnicodeSimpleCloseOver(icu::UnicodeSet& set);
+#endif // V8_INTL_SUPPORT
+
+ // Simple accessors and predicates; both bounds are inclusive.
+ bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
+ base::uc32 from() const { return from_; }
+ base::uc32 to() const { return to_; }
+ // True if the range starts at 0 and reaches at least |max|.
+ bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; }
+ bool IsSingleton() const { return from_ == to_; }
+
+ // Whether a range list is in canonical form: Ranges ordered by from value,
+ // and ranges non-overlapping and non-adjacent.
+ V8_EXPORT_PRIVATE static bool IsCanonical(
+ const ZoneList<CharacterRange>* ranges);
+ // Convert range list to canonical form. The characters covered by the ranges
+ // will still be the same, but no character is in more than one range, and
+ // adjacent ranges are merged. The resulting list may be shorter than the
+ // original, but cannot be longer.
+ static void Canonicalize(ZoneList<CharacterRange>* ranges);
+ // Negate the contents of a character range in canonical form.
+ static void Negate(const ZoneList<CharacterRange>* src,
+ ZoneList<CharacterRange>* dst, Zone* zone);
+ // Intersect the contents of two character ranges in canonical form.
+ static void Intersect(const ZoneList<CharacterRange>* lhs,
+ const ZoneList<CharacterRange>* rhs,
+ ZoneList<CharacterRange>* dst, Zone* zone);
+ // Subtract the contents of |to_remove| from the contents of |src|.
+ static void Subtract(const ZoneList<CharacterRange>* src,
+ const ZoneList<CharacterRange>* to_remove,
+ ZoneList<CharacterRange>* dst, Zone* zone);
+ // Remove all ranges outside the one-byte range.
+ static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
+ // Checks if two ranges (both need to be canonical) are equal.
+ static bool Equals(const ZoneList<CharacterRange>* lhs,
+ const ZoneList<CharacterRange>* rhs);
+
+ private:
+ // Unchecked constructor; the public factories above are the entry points.
+ CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
+
+ // Upper bound used by Range() and Everything().
+ static constexpr int kMaxCodePoint = 0x10ffff;
+
+ base::uc32 from_ = 0;
+ base::uc32 to_ = 0;
+};
+
+// Two ranges are equal when both of their bounds are equal.
+inline bool operator==(const CharacterRange& lhs, const CharacterRange& rhs) {
+  if (lhs.from() != rhs.from()) return false;
+  return lhs.to() == rhs.to();
+}
+// Negation of operator== for CharacterRange.
+inline bool operator!=(const CharacterRange& lhs, const CharacterRange& rhs) {
+  const bool same = (lhs == rhs);
+  return !same;
+}
+
+// Declares the overrides every concrete node type provides: Accept, ToNode and
+// its own As<Name>/Is<Name> pair. The expansion deliberately ends without a
+// semicolon; the use site supplies it.
+#define DECL_BOILERPLATE(Name) \
+ void* Accept(RegExpVisitor* visitor, void* data) override; \
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \
+ override; \
+ RegExp##Name* As##Name() override; \
+ bool Is##Name() override
+
+// Abstract base class of all regexp AST nodes.
+class RegExpTree : public ZoneObject {
+ public:
+ // Value used for an unbounded match length or repetition count; the helpers
+ // in regexp-ast.cc saturate sums at this value.
+ static const int kInfinity = kMaxInt;
+ virtual ~RegExpTree() = default;
+ // Dispatches to the visitor's Visit<Name> method for the concrete type.
+ virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
+ virtual RegExpNode* ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) = 0;
+ // Defaults below are the conservative answers; subclasses override them.
+ virtual bool IsTextElement() { return false; }
+ virtual bool IsAnchoredAtStart() { return false; }
+ virtual bool IsAnchoredAtEnd() { return false; }
+ // Bounds on the length of a match of this expression.
+ virtual int min_match() = 0;
+ virtual int max_match() = 0;
+ // Returns the interval of registers used for captures within this
+ // expression.
+ virtual Interval CaptureRegisters() { return Interval::Empty(); }
+ virtual void AppendToText(RegExpText* text, Zone* zone);
+ // Writes a debug s-expression form of the tree to |os| and returns |os|.
+ V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone);
+ // As<Name>()/Is<Name>() for every tree type. The base definitions return
+ // nullptr/false; each RegExp<Name> overrides its own pair.
+#define MAKE_ASTYPE(Name) \
+ virtual RegExp##Name* As##Name(); \
+ virtual bool Is##Name();
+ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
+#undef MAKE_ASTYPE
+};
+
+
+// AST node for a disjunction: a list of alternatives. The match-length
+// bounds are computed once in the constructor (see regexp-ast.cc) as the
+// minimum of the alternatives' min_match() and the maximum of their
+// max_match().
+class RegExpDisjunction final : public RegExpTree {
+ public:
+ explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
+
+ DECL_BOILERPLATE(Disjunction);
+
+ Interval CaptureRegisters() override;
+ // True only if every alternative is anchored at the start / end.
+ bool IsAnchoredAtStart() override;
+ bool IsAnchoredAtEnd() override;
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
+ ZoneList<RegExpTree*>* alternatives() const { return alternatives_; }
+
+ private:
+ // Helpers defined outside this file.
+ bool SortConsecutiveAtoms(RegExpCompiler* compiler);
+ void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
+ void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
+ ZoneList<RegExpTree*>* alternatives_;
+ int min_match_;
+ int max_match_;
+};
+
+
+// AST node for an alternative: a sequence of nodes. The match-length bounds
+// are computed once in the constructor (see regexp-ast.cc) as the saturating
+// sums of the nodes' bounds.
+class RegExpAlternative final : public RegExpTree {
+ public:
+ explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
+
+ DECL_BOILERPLATE(Alternative);
+
+ Interval CaptureRegisters() override;
+ bool IsAnchoredAtStart() override;
+ bool IsAnchoredAtEnd() override;
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
+ ZoneList<RegExpTree*>* nodes() const { return nodes_; }
+
+ private:
+ ZoneList<RegExpTree*>* nodes_;
+ int min_match_;
+ int max_match_;
+};
+
+
+// AST node for an assertion. It never consumes input: min_match() and
+// max_match() are both 0.
+class RegExpAssertion final : public RegExpTree {
+ public:
+ enum class Type {
+ START_OF_LINE = 0,
+ START_OF_INPUT = 1,
+ END_OF_LINE = 2,
+ END_OF_INPUT = 3,
+ BOUNDARY = 4,
+ NON_BOUNDARY = 5,
+ // Alias of the highest-valued assertion type.
+ LAST_ASSERTION_TYPE = NON_BOUNDARY,
+ };
+ explicit RegExpAssertion(Type type) : assertion_type_(type) {}
+
+ DECL_BOILERPLATE(Assertion);
+
+ // True only for START_OF_INPUT / END_OF_INPUT respectively.
+ bool IsAnchoredAtStart() override;
+ bool IsAnchoredAtEnd() override;
+ int min_match() override { return 0; }
+ int max_match() override { return 0; }
+ Type assertion_type() const { return assertion_type_; }
+
+ private:
+ const Type assertion_type_;
+};
+
+// A set of characters given either as a named standard set or as an explicit
+// list of ranges, depending on which constructor was used.
+class CharacterSet final {
+ public:
+ explicit CharacterSet(StandardCharacterSet standard_set_type)
+ : standard_set_type_(standard_set_type) {}
+ explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {}
+
+ // Defined outside this file.
+ // NOTE(review): presumably materializes the ranges of a standard set on
+ // demand, which would explain the |zone| parameter - confirm.
+ ZoneList<CharacterRange>* ranges(Zone* zone);
+ // Only meaningful when is_standard() is true; value() is called on the
+ // optional without checking it here.
+ StandardCharacterSet standard_set_type() const {
+ return standard_set_type_.value();
+ }
+ void set_standard_set_type(StandardCharacterSet standard_set_type) {
+ standard_set_type_ = standard_set_type;
+ }
+ bool is_standard() const { return standard_set_type_.has_value(); }
+ V8_EXPORT_PRIVATE void Canonicalize();
+
+ private:
+ ZoneList<CharacterRange>* ranges_ = nullptr;
+ base::Optional<StandardCharacterSet> standard_set_type_;
+};
+
+// AST node for a character class expressed as a CharacterSet plus flags.
+class RegExpClassRanges final : public RegExpTree {
+ public:
+ // NEGATED: The character class is negated and should match everything but
+ // the specified ranges.
+ // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
+ // surrogate and should not be unicode-desugared (crbug.com/641091).
+ enum Flag {
+ NEGATED = 1 << 0,
+ CONTAINS_SPLIT_SURROGATE = 1 << 1,
+ };
+ using ClassRangesFlags = base::Flags<Flag>;
+
+ // Note: when |ranges| is empty this constructor mutates the caller's list
+ // (adds Everything()) and toggles the NEGATED flag.
+ RegExpClassRanges(Zone* zone, ZoneList<CharacterRange>* ranges,
+ ClassRangesFlags class_ranges_flags = ClassRangesFlags())
+ : set_(ranges), class_ranges_flags_(class_ranges_flags) {
+ // Convert the empty set of ranges to the negated Everything() range.
+ if (ranges->is_empty()) {
+ ranges->Add(CharacterRange::Everything(), zone);
+ class_ranges_flags_ ^= NEGATED;
+ }
+ }
+ explicit RegExpClassRanges(StandardCharacterSet standard_set_type)
+ : set_(standard_set_type), class_ranges_flags_() {}
+
+ DECL_BOILERPLATE(ClassRanges);
+
+ bool IsTextElement() override { return true; }
+ int min_match() override { return 1; }
+ // The character class may match two code units for unicode regexps.
+ // TODO(yangguo): we should split this class for usage in TextElement, and
+ // make max_match() dependent on the character class content.
+ int max_match() override { return 2; }
+
+ void AppendToText(RegExpText* text, Zone* zone) override;
+
+ // TODO(lrn): Remove need for complex version of is_standard that
+ // recognizes a mangled standard set and just do
+ // { return set_.is_standard(); }
+ bool is_standard(Zone* zone);
+ // Returns a value representing the standard character set if is_standard()
+ // returns true.
+ StandardCharacterSet standard_type() const {
+ return set_.standard_set_type();
+ }
+
+ // Returns a copy of the underlying set.
+ CharacterSet character_set() const { return set_; }
+ ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
+
+ bool is_negated() const { return (class_ranges_flags_ & NEGATED) != 0; }
+ bool contains_split_surrogate() const {
+ return (class_ranges_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
+ }
+
+ private:
+ CharacterSet set_;
+ ClassRangesFlags class_ranges_flags_;
+};
+
+// Strict weak ordering for class strings: longer strings sort first, and
+// strings of equal length are ordered by their first differing code point.
+struct CharacterClassStringLess {
+  bool operator()(const base::Vector<const base::uc32>& lhs,
+                  const base::Vector<const base::uc32>& rhs) const {
+    // Longer strings first so we generate matches for the largest string
+    // possible.
+    if (lhs.length() != rhs.length()) {
+      return rhs.length() < lhs.length();
+    }
+    for (int index = 0; index < lhs.length(); index++) {
+      const base::uc32 left = lhs[index];
+      const base::uc32 right = rhs[index];
+      if (left == right) continue;
+      return left < right;
+    }
+    // Equal strings: neither sorts before the other.
+    return false;
+  }
+};
+
+// A type used for strings as part of character classes (only possible in
+// unicode sets mode). Maps each string (a vector of code points) to a
+// RegExpTree for that string.
+// We use a ZoneMap instead of an UnorderedZoneMap because we need to match
+// the longest alternatives first. By using a ZoneMap with the custom
+// comparator (CharacterClassStringLess, longest strings first) we can avoid
+// sorting before assembling the code.
+// Strings are likely short (the largest string in current unicode properties
+// consists of 10 code points).
+using CharacterClassStrings = ZoneMap<base::Vector<const base::uc32>,
+ RegExpTree*, CharacterClassStringLess>;
+
+// AST node for one operand of a class set expression: a list of character
+// ranges plus an optional map of class strings.
+// TODO(pthier): If we are sure we don't want to use icu::UnicodeSets
+// (performance evaluation pending), this class can be merged with
+// RegExpClassRanges.
+class RegExpClassSetOperand final : public RegExpTree {
+ public:
+ // |ranges| must be non-null (checked in debug builds); |strings| may be
+ // null. The match-length bounds are computed in the constructor.
+ RegExpClassSetOperand(ZoneList<CharacterRange>* ranges,
+ CharacterClassStrings* strings);
+
+ DECL_BOILERPLATE(ClassSetOperand);
+
+ bool IsTextElement() override { return true; }
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
+
+ // Set operations, defined outside this file.
+ // NOTE(review): presumably these modify this operand in place - confirm.
+ void Union(RegExpClassSetOperand* other, Zone* zone);
+ void Intersect(RegExpClassSetOperand* other,
+ ZoneList<CharacterRange>* temp_ranges, Zone* zone);
+ void Subtract(RegExpClassSetOperand* other,
+ ZoneList<CharacterRange>* temp_ranges, Zone* zone);
+
+ // True only if a string map is present and it is not empty.
+ bool has_strings() const { return strings_ != nullptr && !strings_->empty(); }
+ ZoneList<CharacterRange>* ranges() { return ranges_; }
+ // Must not be called when no string map was supplied (debug-checked).
+ CharacterClassStrings* strings() {
+ DCHECK_NOT_NULL(strings_);
+ return strings_;
+ }
+
+ private:
+ ZoneList<CharacterRange>* ranges_;
+ CharacterClassStrings* strings_;
+ int min_match_;
+ int max_match_;
+};
+
+// AST node for a class set expression: a union, intersection or subtraction
+// applied to a list of operand trees, optionally negated.
+class RegExpClassSetExpression final : public RegExpTree {
+ public:
+ enum class OperationType { kUnion, kIntersection, kSubtraction };
+
+ // |operands| must be non-null, and a negated expression must not be flagged
+ // as possibly containing strings (both debug-checked in regexp-ast.cc).
+ RegExpClassSetExpression(OperationType op, bool is_negated,
+ bool may_contain_strings,
+ ZoneList<RegExpTree*>* operands);
+
+ DECL_BOILERPLATE(ClassSetExpression);
+
+ // Create an empty class set expression (matches everything if |is_negated|,
+ // nothing otherwise).
+ static RegExpClassSetExpression* Empty(Zone* zone, bool is_negated);
+
+ bool IsTextElement() override { return true; }
+ // The minimum is always reported as 0; the maximum is the largest
+ // max_match() of any operand, computed in the constructor.
+ int min_match() override { return 0; }
+ int max_match() override { return max_match_; }
+
+ OperationType operation() const { return operation_; }
+ bool is_negated() const { return is_negated_; }
+ bool may_contain_strings() const { return may_contain_strings_; }
+ const ZoneList<RegExpTree*>* operands() const { return operands_; }
+ ZoneList<RegExpTree*>* operands() { return operands_; }
+
+ private:
+ // Recursively evaluates the tree rooted at |root|, computing the valid
+ // CharacterRanges and strings after applying all set operations.
+ // The original tree will be modified by this method, so don't store pointers
+ // to inner nodes of the tree somewhere else!
+ // Modifying the tree in-place saves memory and speeds up multiple calls of
+ // the method (e.g. when unrolling quantifiers).
+ // |temp_ranges| is used for intermediate results, passed as parameter to
+ // avoid allocating new lists all the time.
+ static RegExpClassSetOperand* ComputeExpression(
+ RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone);
+
+ const OperationType operation_;
+ const bool is_negated_;
+ const bool may_contain_strings_;
+ ZoneList<RegExpTree*>* operands_ = nullptr;
+ int max_match_;
+};
+
+// AST node for a literal run of UTF-16 code units. It matches exactly
+// length() code units, so min_match() and max_match() are both the length.
+class RegExpAtom final : public RegExpTree {
+ public:
+  explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}
+
+  DECL_BOILERPLATE(Atom);
+
+  bool IsTextElement() override { return true; }
+  int min_match() override { return length(); }
+  int max_match() override { return length(); }
+  void AppendToText(RegExpText* text, Zone* zone) override;
+
+  // The code units of the literal.
+  base::Vector<const base::uc16> data() const { return data_; }
+  // Number of code units in the literal.
+  int length() const { return data_.length(); }
+
+ private:
+  base::Vector<const base::uc16> data_;
+};
+
+// One element of a RegExpText: a tagged pointer to either a RegExpAtom or a
+// RegExpClassRanges, plus a cp_offset that starts out as -1.
+class TextElement final {
+ public:
+  enum TextType { ATOM, CLASS_RANGES };
+
+  // Factories; defined outside this file.
+  static TextElement Atom(RegExpAtom* atom);
+  static TextElement ClassRanges(RegExpClassRanges* class_ranges);
+
+  int cp_offset() const { return cp_offset_; }
+  void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
+  int length() const;
+
+  TextType text_type() const { return text_type_; }
+
+  RegExpTree* tree() const { return tree_; }
+
+  // Returns the tree as an atom; only valid for ATOM elements (debug-checked).
+  RegExpAtom* atom() const {
+    DCHECK(text_type() == ATOM);
+    RegExpTree* node = tree();
+    return reinterpret_cast<RegExpAtom*>(node);
+  }
+
+  // Returns the tree as class ranges; only valid for CLASS_RANGES elements
+  // (debug-checked).
+  RegExpClassRanges* class_ranges() const {
+    DCHECK(text_type() == CLASS_RANGES);
+    RegExpTree* node = tree();
+    return reinterpret_cast<RegExpClassRanges*>(node);
+  }
+
+ private:
+  TextElement(TextType text_type, RegExpTree* tree)
+      : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
+
+  int cp_offset_;
+  TextType text_type_;
+  RegExpTree* tree_;
+};
+
+// AST node for a sequence of text elements. length_ accumulates the
+// length() of every element added, and is reported as both the minimum and
+// the maximum match length.
+class RegExpText final : public RegExpTree {
+ public:
+  explicit RegExpText(Zone* zone) : elements_(2, zone) {}
+
+  DECL_BOILERPLATE(Text);
+
+  bool IsTextElement() override { return true; }
+  int min_match() override { return length_; }
+  int max_match() override { return length_; }
+  void AppendToText(RegExpText* text, Zone* zone) override;
+
+  // Appends |elm| and adds its length to the running total.
+  void AddElement(TextElement elm, Zone* zone) {
+    elements_.Add(elm, zone);
+    const int added_length = elm.length();
+    length_ += added_length;
+  }
+
+  ZoneList<TextElement>* elements() { return &elements_; }
+
+ private:
+  ZoneList<TextElement> elements_;
+  int length_ = 0;
+};
+
+
+// AST node for a quantified sub-expression: |body| repeated between |min| and
+// |max| times, greedily, non-greedily or possessively.
+class RegExpQuantifier final : public RegExpTree {
+ public:
+  enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
+
+  // The match-length bounds are count * body bound, clamped to kInfinity when
+  // the product would exceed it.
+  RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
+      : body_(body), min_(min), max_(max), quantifier_type_(type) {
+    const int body_min = body->min_match();
+    const bool min_overflows = min > 0 && body_min > kInfinity / min;
+    min_match_ = min_overflows ? kInfinity : min * body_min;
+
+    const int body_max = body->max_match();
+    const bool max_overflows = max > 0 && body_max > kInfinity / max;
+    max_match_ = max_overflows ? kInfinity : max * body_max;
+  }
+
+  DECL_BOILERPLATE(Quantifier);
+
+  static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
+                            RegExpCompiler* compiler, RegExpNode* on_success,
+                            bool not_at_start = false);
+  Interval CaptureRegisters() override;
+  int min_match() override { return min_match_; }
+  int max_match() override { return max_match_; }
+  int min() const { return min_; }
+  int max() const { return max_; }
+  QuantifierType quantifier_type() const { return quantifier_type_; }
+  bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
+  bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; }
+  bool is_greedy() const { return quantifier_type_ == GREEDY; }
+  RegExpTree* body() const { return body_; }
+
+ private:
+  RegExpTree* body_;
+  int min_;
+  int max_;
+  int min_match_;
+  int max_match_;
+  QuantifierType quantifier_type_;
+};
+
+
+class RegExpCapture final : public RegExpTree {  // AST node for a capture with an index, an optional name and a body set after construction.
+ public:
+ explicit RegExpCapture(int index)
+ : body_(nullptr),  // These initializers repeat the in-class defaults declared below.
+ index_(index),
+ min_match_(0),
+ max_match_(0),
+ name_(nullptr) {}
+
+ DECL_BOILERPLATE(Capture);
+
+ static RegExpNode* ToNode(RegExpTree* body, int index,
+ RegExpCompiler* compiler, RegExpNode* on_success);
+ bool IsAnchoredAtStart() override;
+ bool IsAnchoredAtEnd() override;
+ Interval CaptureRegisters() override;
+ int min_match() override { return min_match_; }  // 0 until set_body() is called.
+ int max_match() override { return max_match_; }  // 0 until set_body() is called.
+ RegExpTree* body() { return body_; }
+ void set_body(RegExpTree* body) {  // Stores the body and caches its min/max match lengths.
+ body_ = body;
+ min_match_ = body->min_match();
+ max_match_ = body->max_match();
+ }
+ int index() const { return index_; }
+ const ZoneVector<base::uc16>* name() const { return name_; }
+ void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
+ static int StartRegister(int index) { return index * 2; }  // Even register of the pair used by capture `index`.
+ static int EndRegister(int index) { return index * 2 + 1; }  // Odd register immediately after StartRegister(index).
+
+ private:
+ RegExpTree* body_ = nullptr;
+ int index_;
+ int min_match_ = 0;
+ int max_match_ = 0;
+ const ZoneVector<base::uc16>* name_ = nullptr;  // nullptr unless set_name() was called.
+};
+
+class RegExpGroup final : public RegExpTree {  // AST node that wraps a body and forwards anchoring/capture queries to it.
+ public:
+ explicit RegExpGroup(RegExpTree* body)
+ : body_(body),
+ min_match_(body->min_match()),  // Match bounds are snapshotted from the body at construction time.
+ max_match_(body->max_match()) {}
+
+ DECL_BOILERPLATE(Group);
+
+ bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
+ bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
+ Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
+ RegExpTree* body() const { return body_; }
+
+ private:
+ RegExpTree* body_;
+ int min_match_;  // Cached body->min_match().
+ int max_match_;  // Cached body->max_match().
+};
+
+class RegExpLookaround final : public RegExpTree {  // AST node for a lookahead/lookbehind around a body.
+ public:
+ enum Type { LOOKAHEAD, LOOKBEHIND };
+
+ RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
+ int capture_from, Type type)
+ : body_(body),
+ is_positive_(is_positive),
+ capture_count_(capture_count),
+ capture_from_(capture_from),
+ type_(type) {}
+
+ DECL_BOILERPLATE(Lookaround);
+
+ Interval CaptureRegisters() override;
+ bool IsAnchoredAtStart() override;
+ int min_match() override { return 0; }  // Reports a match length of 0 regardless of the body.
+ int max_match() override { return 0; }
+ RegExpTree* body() const { return body_; }
+ bool is_positive() const { return is_positive_; }
+ int capture_count() const { return capture_count_; }
+ int capture_from() const { return capture_from_; }
+ Type type() const { return type_; }
+
+ class Builder {  // Helper declared here; its constructor and ForMatch are defined outside this header.
+ public:
+ Builder(bool is_positive, RegExpNode* on_success,
+ int stack_pointer_register, int position_register,
+ int capture_register_count = 0, int capture_register_start = 0);
+ RegExpNode* on_match_success() const { return on_match_success_; }
+ RegExpNode* ForMatch(RegExpNode* match);
+
+ private:
+ bool is_positive_;
+ RegExpNode* on_match_success_;
+ RegExpNode* on_success_;
+ int stack_pointer_register_;
+ int position_register_;
+ };
+
+ private:
+ RegExpTree* body_;
+ bool is_positive_;
+ int capture_count_;
+ int capture_from_;
+ Type type_;
+};
+
+
+class RegExpBackReference final : public RegExpTree {  // AST node referring to a RegExpCapture, optionally by name.
+ public:
+ explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {}  // Leaves capture_ as nullptr until set_capture() is called.
+ RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
+ : capture_(capture), flags_(flags) {}
+
+ DECL_BOILERPLATE(BackReference);
+
+ int min_match() override { return 0; }
+ // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
+ // recursion, we give up. Ignorance is bliss.
+ int max_match() override { return kInfinity; }
+ int index() const { return capture_->index(); }  // Dereferences capture_, so it must be non-null when this is called.
+ RegExpCapture* capture() const { return capture_; }
+ void set_capture(RegExpCapture* capture) { capture_ = capture; }
+ const ZoneVector<base::uc16>* name() const { return name_; }
+ void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
+
+ private:
+ RegExpCapture* capture_ = nullptr;
+ const ZoneVector<base::uc16>* name_ = nullptr;
+ const RegExpFlags flags_;  // Stored at construction; no accessor is declared in this class.
+};
+
+
+class RegExpEmpty final : public RegExpTree {  // AST node with no state; both match bounds are 0.
+ public:
+ DECL_BOILERPLATE(Empty);
+ int min_match() override { return 0; }
+ int max_match() override { return 0; }
+};
+
+} // namespace internal
+} // namespace v8
+
+#undef DECL_BOILERPLATE
+
+#endif // V8_REGEXP_REGEXP_AST_H_
diff --git a/js/src/irregexp/imported/regexp-bytecode-generator-inl.h b/js/src/irregexp/imported/regexp-bytecode-generator-inl.h
new file mode 100644
index 0000000000..807ca66f47
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecode-generator-inl.h
@@ -0,0 +1,55 @@
+// Copyright 2008-2009 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
+#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
+
+#include "irregexp/imported/regexp-bytecode-generator.h"
+
+#include "irregexp/imported/regexp-bytecodes.h"
+
+namespace v8 {
+namespace internal {
+
+void RegExpBytecodeGenerator::Emit(uint32_t byte, uint32_t twenty_four_bits) {  // Emits one 32-bit word combining a bytecode and an unsigned 24-bit argument.
+ DCHECK(is_uint24(twenty_four_bits));
+ Emit32((twenty_four_bits << BYTECODE_SHIFT) | byte);  // Argument goes above BYTECODE_SHIFT, bytecode in the low bits.
+}
+
+void RegExpBytecodeGenerator::Emit(uint32_t byte, int32_t twenty_four_bits) {  // Signed-argument overload of Emit.
+ DCHECK(is_int24(twenty_four_bits));
+ Emit32((static_cast<uint32_t>(twenty_four_bits) << BYTECODE_SHIFT) | byte);  // Cast to unsigned before shifting the possibly negative argument.
+}
+
+void RegExpBytecodeGenerator::Emit16(uint32_t word) {  // Stores word as a uint16_t at pc_ and advances pc_ by 2.
+ DCHECK(pc_ <= static_cast<int>(buffer_.size()));
+ if (pc_ + 1 >= static_cast<int>(buffer_.size())) {  // Fewer than 2 bytes remain in the buffer.
+ ExpandBuffer();
+ }
+ *reinterpret_cast<uint16_t*>(buffer_.data() + pc_) = word;  // Assignment narrows word to 16 bits.
+ pc_ += 2;
+}
+
+void RegExpBytecodeGenerator::Emit8(uint32_t word) {  // Stores word as a single byte at pc_ and advances pc_ by 1.
+ DCHECK(pc_ <= static_cast<int>(buffer_.size()));
+ if (pc_ == static_cast<int>(buffer_.size())) {  // No bytes remain in the buffer.
+ ExpandBuffer();
+ }
+ *reinterpret_cast<unsigned char*>(buffer_.data() + pc_) = word;  // Assignment narrows word to 8 bits.
+ pc_ += 1;
+}
+
+void RegExpBytecodeGenerator::Emit32(uint32_t word) {  // Stores a 32-bit word at pc_ and advances pc_ by 4.
+ DCHECK(pc_ <= static_cast<int>(buffer_.size()));
+ if (pc_ + 3 >= static_cast<int>(buffer_.size())) {  // Fewer than 4 bytes remain in the buffer.
+ ExpandBuffer();
+ }
+ *reinterpret_cast<uint32_t*>(buffer_.data() + pc_) = word;
+ pc_ += 4;
+}
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.cc b/js/src/irregexp/imported/regexp-bytecode-generator.cc
new file mode 100644
index 0000000000..934a39130d
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecode-generator.cc
@@ -0,0 +1,405 @@
+// Copyright 2008-2009 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-bytecode-generator.h"
+
+#include "irregexp/imported/regexp-bytecode-generator-inl.h"
+#include "irregexp/imported/regexp-bytecode-peephole.h"
+#include "irregexp/imported/regexp-bytecodes.h"
+#include "irregexp/imported/regexp-macro-assembler.h"
+
+namespace v8 {
+namespace internal {
+
+RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)  // Starts with a kInitialBufferSize-element buffer and pc_ at 0.
+ : RegExpMacroAssembler(isolate, zone),
+ buffer_(kInitialBufferSize, zone),
+ pc_(0),
+ advance_current_end_(kInvalidPC),  // advance_current_start_/offset_ are not initialized here; AdvanceCurrentPosition sets them.
+ jump_edges_(zone),
+ isolate_(isolate) {}
+
+RegExpBytecodeGenerator::~RegExpBytecodeGenerator() {
+ if (backtrack_.is_linked()) backtrack_.Unuse();  // Calls Unuse() on backtrack_ if it is still linked when the generator is destroyed.
+}
+
+RegExpBytecodeGenerator::IrregexpImplementation
+RegExpBytecodeGenerator::Implementation() {  // Always reports the bytecode implementation.
+ return kBytecodeImplementation;
+}
+
+void RegExpBytecodeGenerator::Bind(Label* l) {  // Binds l to pc_ and patches every pending jump slot linked to it.
+ advance_current_end_ = kInvalidPC;  // A bound label prevents GoTo from merging with an earlier ADVANCE_CP.
+ DCHECK(!l->is_bound());
+ if (l->is_linked()) {
+ int pos = l->pos();
+ while (pos != 0) {  // Each unresolved slot holds the position of the previous one; 0 ends the chain.
+ int fixup = pos;
+ pos = *reinterpret_cast<int32_t*>(buffer_.data() + fixup);  // Read the next link before overwriting the slot.
+ *reinterpret_cast<uint32_t*>(buffer_.data() + fixup) = pc_;
+ jump_edges_.emplace(fixup, pc_);  // Record slot -> target for the peephole optimizer.
+ }
+ }
+ l->bind_to(pc_);
+}
+
+void RegExpBytecodeGenerator::EmitOrLink(Label* l) {  // Emits a 32-bit jump slot for l: the target if bound, else a link in l's pending chain.
+ if (l == nullptr) l = &backtrack_;  // A null label means the shared backtrack_ label.
+ int pos = 0;
+ if (l->is_bound()) {
+ pos = l->pos();
+ jump_edges_.emplace(pc_, pos);  // Target already known; record the edge immediately.
+ } else {
+ if (l->is_linked()) {
+ pos = l->pos();  // Previous pending slot; stays 0 when this is the first use.
+ }
+ l->link_to(pc_);  // The slot emitted below becomes the position Bind() starts from.
+ }
+ Emit32(pos);
+}
+
+void RegExpBytecodeGenerator::PopRegister(int register_index) {  // Emits BC_POP_REGISTER with the register index as its argument.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_POP_REGISTER, register_index);
+}
+
+void RegExpBytecodeGenerator::PushRegister(int register_index,
+ StackCheckFlag check_stack_limit) {  // check_stack_limit is not used by this implementation.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_PUSH_REGISTER, register_index);
+}
+
+void RegExpBytecodeGenerator::WriteCurrentPositionToRegister(int register_index,
+ int cp_offset) {  // Emits BC_SET_REGISTER_TO_CP followed by a 32-bit cp_offset operand.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_SET_REGISTER_TO_CP, register_index);
+ Emit32(cp_offset); // Current position offset.
+}
+
+void RegExpBytecodeGenerator::ClearRegisters(int reg_from, int reg_to) {  // Emits one SetRegister(reg, -1) per register in the inclusive range.
+ DCHECK(reg_from <= reg_to);
+ for (int reg = reg_from; reg <= reg_to; reg++) {
+ SetRegister(reg, -1);
+ }
+}
+
+void RegExpBytecodeGenerator::ReadCurrentPositionFromRegister(
+ int register_index) {  // Emits BC_SET_CP_TO_REGISTER with the register index as its argument.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_SET_CP_TO_REGISTER, register_index);
+}
+
+void RegExpBytecodeGenerator::WriteStackPointerToRegister(int register_index) {  // Emits BC_SET_REGISTER_TO_SP with the register index as its argument.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_SET_REGISTER_TO_SP, register_index);
+}
+
+void RegExpBytecodeGenerator::ReadStackPointerFromRegister(int register_index) {  // Emits BC_SET_SP_TO_REGISTER with the register index as its argument.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_SET_SP_TO_REGISTER, register_index);
+}
+
+void RegExpBytecodeGenerator::SetCurrentPositionFromEnd(int by) {  // Emits BC_SET_CURRENT_POSITION_FROM_END with `by` as its argument.
+ DCHECK(is_uint24(by));  // `by` must fit the unsigned 24-bit argument field.
+ Emit(BC_SET_CURRENT_POSITION_FROM_END, by);
+}
+
+void RegExpBytecodeGenerator::SetRegister(int register_index, int to) {  // Emits BC_SET_REGISTER followed by the 32-bit value `to`.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_SET_REGISTER, register_index);
+ Emit32(to);
+}
+
+void RegExpBytecodeGenerator::AdvanceRegister(int register_index, int by) {  // Emits BC_ADVANCE_REGISTER followed by the 32-bit delta `by`.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_ADVANCE_REGISTER, register_index);
+ Emit32(by);
+}
+
+void RegExpBytecodeGenerator::PopCurrentPosition() { Emit(BC_POP_CP, 0); }  // Emits BC_POP_CP with a zero argument.
+
+void RegExpBytecodeGenerator::PushCurrentPosition() { Emit(BC_PUSH_CP, 0); }  // Emits BC_PUSH_CP with a zero argument.
+
+void RegExpBytecodeGenerator::Backtrack() {  // Emits BC_POP_BT carrying a result code as its argument.
+ int error_code =
+ can_fallback() ? RegExp::RE_FALLBACK_TO_EXPERIMENTAL : RegExp::RE_FAILURE;  // Fallback code is used only when can_fallback() is true.
+ Emit(BC_POP_BT, error_code);
+}
+
+void RegExpBytecodeGenerator::GoTo(Label* l) {  // Emits a jump to l, merging it with an immediately preceding ADVANCE_CP when possible.
+ if (advance_current_end_ == pc_) {  // The last thing emitted was an ADVANCE_CP ending exactly here.
+ // Combine advance current and goto.
+ pc_ = advance_current_start_;  // Rewind so the combined bytecode overwrites the ADVANCE_CP.
+ Emit(BC_ADVANCE_CP_AND_GOTO, advance_current_offset_);
+ EmitOrLink(l);
+ advance_current_end_ = kInvalidPC;
+ } else {
+ // Regular goto.
+ Emit(BC_GOTO, 0);
+ EmitOrLink(l);
+ }
+}
+
+void RegExpBytecodeGenerator::PushBacktrack(Label* l) {  // Emits BC_PUSH_BT followed by a jump slot for l.
+ Emit(BC_PUSH_BT, 0);
+ EmitOrLink(l);
+}
+
+bool RegExpBytecodeGenerator::Succeed() {  // Emits BC_SUCCEED and always returns false.
+ Emit(BC_SUCCEED, 0);
+ return false; // Restart matching for global regexp not supported.
+}
+
+void RegExpBytecodeGenerator::Fail() { Emit(BC_FAIL, 0); }  // Emits BC_FAIL with a zero argument.
+
+void RegExpBytecodeGenerator::AdvanceCurrentPosition(int by) {  // Emits BC_ADVANCE_CP and remembers its extent so GoTo can merge with it.
+ // TODO(chromium:1166138): Turn back into DCHECKs once the underlying issue
+ // is fixed.
+ CHECK_LE(kMinCPOffset, by);
+ CHECK_GE(kMaxCPOffset, by);
+ advance_current_start_ = pc_;  // Where GoTo rewinds to when merging.
+ advance_current_offset_ = by;
+ Emit(BC_ADVANCE_CP, by);
+ advance_current_end_ = pc_;  // GoTo merges only if pc_ still equals this value.
+}
+
+void RegExpBytecodeGenerator::CheckGreedyLoop(
+ Label* on_tos_equals_current_position) {  // Emits BC_CHECK_GREEDY followed by a jump slot for the label.
+ Emit(BC_CHECK_GREEDY, 0);
+ EmitOrLink(on_tos_equals_current_position);
+}
+
+void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset,  // Emits a 1-, 2- or 4-character load, checked or unchecked.
+ Label* on_failure,
+ bool check_bounds,
+ int characters,
+ int eats_at_least) {
+ DCHECK_GE(eats_at_least, characters);
+ if (eats_at_least > characters && check_bounds) {  // Do one position check covering eats_at_least characters up front.
+ DCHECK(is_int24(cp_offset + eats_at_least));
+ Emit(BC_CHECK_CURRENT_POSITION, cp_offset + eats_at_least);
+ EmitOrLink(on_failure);
+ check_bounds = false; // Load below doesn't need to check.
+ }
+
+ DCHECK_LE(kMinCPOffset, cp_offset);
+ DCHECK_GE(kMaxCPOffset, cp_offset);
+ int bytecode;
+ if (check_bounds) {  // Checked variants are followed by an on_failure jump slot (emitted at the end).
+ if (characters == 4) {
+ bytecode = BC_LOAD_4_CURRENT_CHARS;
+ } else if (characters == 2) {
+ bytecode = BC_LOAD_2_CURRENT_CHARS;
+ } else {
+ DCHECK_EQ(1, characters);
+ bytecode = BC_LOAD_CURRENT_CHAR;
+ }
+ } else {  // Unchecked variants take no jump slot.
+ if (characters == 4) {
+ bytecode = BC_LOAD_4_CURRENT_CHARS_UNCHECKED;
+ } else if (characters == 2) {
+ bytecode = BC_LOAD_2_CURRENT_CHARS_UNCHECKED;
+ } else {
+ DCHECK_EQ(1, characters);
+ bytecode = BC_LOAD_CURRENT_CHAR_UNCHECKED;
+ }
+ }
+ Emit(bytecode, cp_offset);
+ if (check_bounds) EmitOrLink(on_failure);
+}
+
+void RegExpBytecodeGenerator::CheckCharacterLT(base::uc16 limit,
+ Label* on_less) {  // Emits BC_CHECK_LT with limit as argument, then a jump slot for on_less.
+ Emit(BC_CHECK_LT, limit);
+ EmitOrLink(on_less);
+}
+
+void RegExpBytecodeGenerator::CheckCharacterGT(base::uc16 limit,
+ Label* on_greater) {  // Emits BC_CHECK_GT with limit as argument, then a jump slot for on_greater.
+ Emit(BC_CHECK_GT, limit);
+ EmitOrLink(on_greater);
+}
+
+void RegExpBytecodeGenerator::CheckCharacter(uint32_t c, Label* on_equal) {  // Emits an equality check against c, then a jump slot for on_equal.
+ if (c > MAX_FIRST_ARG) {  // c is too large for the packed argument; use the variant with a separate 32-bit operand.
+ Emit(BC_CHECK_4_CHARS, 0);
+ Emit32(c);
+ } else {
+ Emit(BC_CHECK_CHAR, c);
+ }
+ EmitOrLink(on_equal);
+}
+
+void RegExpBytecodeGenerator::CheckAtStart(int cp_offset, Label* on_at_start) {  // Emits BC_CHECK_AT_START with cp_offset, then a jump slot.
+ Emit(BC_CHECK_AT_START, cp_offset);
+ EmitOrLink(on_at_start);
+}
+
+void RegExpBytecodeGenerator::CheckNotAtStart(int cp_offset,
+ Label* on_not_at_start) {  // Emits BC_CHECK_NOT_AT_START with cp_offset, then a jump slot.
+ Emit(BC_CHECK_NOT_AT_START, cp_offset);
+ EmitOrLink(on_not_at_start);
+}
+
+void RegExpBytecodeGenerator::CheckNotCharacter(uint32_t c,
+ Label* on_not_equal) {  // Emits an inequality check against c, then a jump slot for on_not_equal.
+ if (c > MAX_FIRST_ARG) {  // c is too large for the packed argument; use the variant with a separate 32-bit operand.
+ Emit(BC_CHECK_NOT_4_CHARS, 0);
+ Emit32(c);
+ } else {
+ Emit(BC_CHECK_NOT_CHAR, c);
+ }
+ EmitOrLink(on_not_equal);
+}
+
+void RegExpBytecodeGenerator::CheckCharacterAfterAnd(uint32_t c, uint32_t mask,
+ Label* on_equal) {  // Emits a masked equality check: c, then a 32-bit mask, then a jump slot.
+ if (c > MAX_FIRST_ARG) {  // c is too large for the packed argument; emit it as a separate 32-bit operand.
+ Emit(BC_AND_CHECK_4_CHARS, 0);
+ Emit32(c);
+ } else {
+ Emit(BC_AND_CHECK_CHAR, c);
+ }
+ Emit32(mask);
+ EmitOrLink(on_equal);
+}
+
+void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c,
+ uint32_t mask,
+ Label* on_not_equal) {  // Emits a masked inequality check: c, then a 32-bit mask, then a jump slot.
+ if (c > MAX_FIRST_ARG) {  // c is too large for the packed argument; emit it as a separate 32-bit operand.
+ Emit(BC_AND_CHECK_NOT_4_CHARS, 0);
+ Emit32(c);
+ } else {
+ Emit(BC_AND_CHECK_NOT_CHAR, c);
+ }
+ Emit32(mask);
+ EmitOrLink(on_not_equal);
+}
+
+void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd(
+ base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) {  // Operand order: c (packed), minus (16 bits), mask (16 bits), jump slot.
+ Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c);
+ Emit16(minus);
+ Emit16(mask);
+ EmitOrLink(on_not_equal);
+}
+
+void RegExpBytecodeGenerator::CheckCharacterInRange(base::uc16 from,
+ base::uc16 to,
+ Label* on_in_range) {  // Operand order: from (16 bits), to (16 bits), jump slot.
+ Emit(BC_CHECK_CHAR_IN_RANGE, 0);
+ Emit16(from);
+ Emit16(to);
+ EmitOrLink(on_in_range);
+}
+
+void RegExpBytecodeGenerator::CheckCharacterNotInRange(base::uc16 from,
+ base::uc16 to,
+ Label* on_not_in_range) {  // Operand order: from (16 bits), to (16 bits), jump slot.
+ Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0);
+ Emit16(from);
+ Emit16(to);
+ EmitOrLink(on_not_in_range);
+}
+
+void RegExpBytecodeGenerator::CheckBitInTable(Handle<ByteArray> table,
+ Label* on_bit_set) {  // Emits BC_CHECK_BIT_IN_TABLE, a jump slot, then the table packed as a bitmap.
+ Emit(BC_CHECK_BIT_IN_TABLE, 0);
+ EmitOrLink(on_bit_set);
+ for (int i = 0; i < kTableSize; i += kBitsPerByte) {  // One emitted byte per kBitsPerByte table entries.
+ int byte = 0;
+ for (int j = 0; j < kBitsPerByte; j++) {
+ if (table->get(i + j) != 0) byte |= 1 << j;  // Bit j is set when table entry i + j is non-zero.
+ }
+ Emit8(byte);
+ }
+}
+
+void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg,
+ bool read_backward,
+ Label* on_not_equal) {  // Emits the forward or backward back-reference check, then a jump slot.
+ DCHECK_LE(0, start_reg);
+ DCHECK_GE(kMaxRegister, start_reg);
+ Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF,
+ start_reg);
+ EmitOrLink(on_not_equal);
+}
+
+void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase(
+ int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {  // Picks one of four NO_CASE bytecodes from (read_backward, unicode).
+ DCHECK_LE(0, start_reg);
+ DCHECK_GE(kMaxRegister, start_reg);
+ Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
+ : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
+ : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
+ : BC_CHECK_NOT_BACK_REF_NO_CASE),
+ start_reg);
+ EmitOrLink(on_not_equal);
+}
+
+void RegExpBytecodeGenerator::IfRegisterLT(int register_index, int comparand,
+ Label* on_less_than) {  // Emits BC_CHECK_REGISTER_LT, a 32-bit comparand, then a jump slot.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_CHECK_REGISTER_LT, register_index);
+ Emit32(comparand);
+ EmitOrLink(on_less_than);
+}
+
+void RegExpBytecodeGenerator::IfRegisterGE(int register_index, int comparand,
+ Label* on_greater_or_equal) {  // Emits BC_CHECK_REGISTER_GE, a 32-bit comparand, then a jump slot.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_CHECK_REGISTER_GE, register_index);
+ Emit32(comparand);
+ EmitOrLink(on_greater_or_equal);
+}
+
+void RegExpBytecodeGenerator::IfRegisterEqPos(int register_index,
+ Label* on_eq) {  // Emits BC_CHECK_REGISTER_EQ_POS with the register index, then a jump slot.
+ DCHECK_LE(0, register_index);
+ DCHECK_GE(kMaxRegister, register_index);
+ Emit(BC_CHECK_REGISTER_EQ_POS, register_index);
+ EmitOrLink(on_eq);
+}
+
+Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {  // Finalizes the bytecode and returns it as a ByteArray.
+ Bind(&backtrack_);  // Resolve every jump that used the default (null) label.
+ Backtrack();  // The code at the backtrack_ label is a single BC_POP_BT.
+
+ Handle<ByteArray> array;
+ if (v8_flags.regexp_peephole_optimization) {
+ array = RegExpBytecodePeepholeOptimization::OptimizeBytecode(
+ isolate_, zone(), source, buffer_.data(), length(), jump_edges_);
+ } else {
+ array = isolate_->factory()->NewByteArray(length());  // Unoptimized path: copy the first length() bytes verbatim.
+ Copy(array->GetDataStartAddress());
+ }
+
+ return array;
+}
+
+int RegExpBytecodeGenerator::length() { return pc_; }  // Number of bytes emitted so far, not the buffer capacity.
+
+void RegExpBytecodeGenerator::Copy(byte* a) {  // Copies the first length() bytes of buffer_ to a.
+ MemCopy(a, buffer_.data(), length());
+}
+
+void RegExpBytecodeGenerator::ExpandBuffer() {  // Doubles the size of buffer_.
+ // TODO(jgruber): The growth strategy could be smarter for large sizes.
+ // TODO(jgruber): It's not necessary to default-initialize new elements.
+ buffer_.resize(buffer_.size() * 2);
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.h b/js/src/irregexp/imported/regexp-bytecode-generator.h
new file mode 100644
index 0000000000..351f6e0cc6
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecode-generator.h
@@ -0,0 +1,140 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
+#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
+
+#include "irregexp/imported/regexp-macro-assembler.h"
+
+namespace v8 {
+namespace internal {
+
+// An assembler/generator for the Irregexp byte code.
+class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
+ public:
+ // Create an assembler. Instructions and relocation information are emitted
+ // into a buffer, with the instructions starting from the beginning and the
+ // relocation information starting from the end of the buffer. See CodeDesc
+ // for a detailed comment on the layout (globals.h).
+ //
+ // The assembler allocates and grows its own buffer, and buffer_size
+ // determines the initial buffer size. The buffer is owned by the assembler
+ // and deallocated upon destruction of the assembler.
+ RegExpBytecodeGenerator(Isolate* isolate, Zone* zone);  // NOTE(review): there is no buffer_size parameter; the initial size is kInitialBufferSize.
+ ~RegExpBytecodeGenerator() override;
+ // The byte-code interpreter checks on each push anyway.
+ int stack_limit_slack() override { return 1; }
+ bool CanReadUnaligned() const override { return false; }
+ void Bind(Label* label) override;
+ void AdvanceCurrentPosition(int by) override; // Signed cp change.
+ void PopCurrentPosition() override;
+ void PushCurrentPosition() override;
+ void Backtrack() override;
+ void GoTo(Label* label) override;
+ void PushBacktrack(Label* label) override;
+ bool Succeed() override;
+ void Fail() override;
+ void PopRegister(int register_index) override;
+ void PushRegister(int register_index,
+ StackCheckFlag check_stack_limit) override;
+ void AdvanceRegister(int reg, int by) override; // r[reg] += by.
+ void SetCurrentPositionFromEnd(int by) override;
+ void SetRegister(int register_index, int to) override;
+ void WriteCurrentPositionToRegister(int reg, int cp_offset) override;
+ void ClearRegisters(int reg_from, int reg_to) override;
+ void ReadCurrentPositionFromRegister(int reg) override;
+ void WriteStackPointerToRegister(int reg) override;
+ void ReadStackPointerFromRegister(int reg) override;
+ void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
+ bool check_bounds, int characters,
+ int eats_at_least) override;
+ void CheckCharacter(unsigned c, Label* on_equal) override;
+ void CheckCharacterAfterAnd(unsigned c, unsigned mask,
+ Label* on_equal) override;
+ void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
+ void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
+ void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
+ void CheckAtStart(int cp_offset, Label* on_at_start) override;
+ void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
+ void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
+ void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
+ Label* on_not_equal) override;
+ void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
+ base::uc16 mask,
+ Label* on_not_equal) override;
+ void CheckCharacterInRange(base::uc16 from, base::uc16 to,
+ Label* on_in_range) override;
+ void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
+ Label* on_not_in_range) override;
+ bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
+ Label* on_in_range) override {
+ // Disabled in the interpreter, because 1) there is no constant pool that
+ // could store the ByteArray pointer, 2) bytecode size limits are not as
+ // restrictive as code (e.g. branch distances on arm), 3) bytecode for
+ // large character classes is already quite compact.
+ // TODO(jgruber): Consider using BytecodeArrays (with a constant pool)
+ // instead of plain ByteArrays; then we could implement
+ // CheckCharacterInRangeArray in the interpreter.
+ return false;
+ }
+ bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
+ Label* on_not_in_range) override {
+ return false;  // Emits nothing and returns false, same as CheckCharacterInRangeArray.
+ }
+ void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
+ void CheckNotBackReference(int start_reg, bool read_backward,
+ Label* on_no_match) override;
+ void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
+ bool unicode,
+ Label* on_no_match) override;
+ void IfRegisterLT(int register_index, int comparand, Label* if_lt) override;
+ void IfRegisterGE(int register_index, int comparand, Label* if_ge) override;
+ void IfRegisterEqPos(int register_index, Label* if_eq) override;
+
+ IrregexpImplementation Implementation() override;
+ Handle<HeapObject> GetCode(Handle<String> source) override;
+
+ private:
+ void ExpandBuffer();  // Doubles buffer_ when an emit would run past its end.
+
+ // Code and bitmap emission.
+ inline void EmitOrLink(Label* label);  // Declared inline but defined in regexp-bytecode-generator.cc.
+ inline void Emit32(uint32_t x);
+ inline void Emit16(uint32_t x);
+ inline void Emit8(uint32_t x);
+ inline void Emit(uint32_t bc, uint32_t arg);
+ inline void Emit(uint32_t bc, int32_t arg);
+ // Bytecode buffer.
+ int length();  // Returns pc_, i.e. bytes emitted so far.
+ void Copy(byte* a);
+
+ // The buffer into which code and relocation info are generated.
+ static constexpr int kInitialBufferSize = 1024;
+ ZoneVector<byte> buffer_;
+
+ // The program counter.
+ int pc_;
+ Label backtrack_;  // Used by EmitOrLink whenever a null label is passed; bound in GetCode.
+
+ int advance_current_start_;  // pc_ at which the most recent BC_ADVANCE_CP was emitted.
+ int advance_current_offset_;  // Argument of the most recent BC_ADVANCE_CP.
+ int advance_current_end_;  // pc_ just after that BC_ADVANCE_CP, or kInvalidPC when GoTo must not merge.
+
+ // Stores jump edges emitted for the bytecode (used by
+ // RegExpBytecodePeepholeOptimization).
+ // Key: jump source (offset in buffer_ where jump destination is stored).
+ // Value: jump destination (offset in buffer_ to jump to).
+ ZoneUnorderedMap<int, int> jump_edges_;
+
+ Isolate* isolate_;
+
+ static const int kInvalidPC = -1;
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodeGenerator);
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.cc b/js/src/irregexp/imported/regexp-bytecode-peephole.cc
new file mode 100644
index 0000000000..9e49bfbeca
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecode-peephole.cc
@@ -0,0 +1,1027 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-bytecode-peephole.h"
+
+#include "irregexp/imported/regexp-bytecodes.h"
+
+namespace v8 {
+namespace internal {
+
+namespace {
+
+// Describes where an argument lives: its byte offset and its length in bytes.
+// The nodes below store offsets relative to the start of a bytecode sequence.
+struct BytecodeArgument {
+  int offset;
+  int length;
+
+  BytecodeArgument(int argument_offset, int argument_length)
+      : offset(argument_offset), length(argument_length) {}
+};
+
+// An argument of the old sequence together with the byte length it should
+// occupy in the optimized (replacement) bytecode.
+struct BytecodeArgumentMapping : BytecodeArgument {
+  int new_length;
+
+  BytecodeArgumentMapping(int argument_offset, int argument_length,
+                          int mapped_length)
+      : BytecodeArgument(argument_offset, argument_length),
+        new_length(mapped_length) {}
+};
+
+// A condition attached to a sequence node. The argument described by the
+// BytecodeArgument base must either equal an address (kCheckAddress) or equal
+// the value of another argument in the sequence (kCheckValue).
+struct BytecodeArgumentCheck : BytecodeArgument {
+  enum CheckType { kCheckAddress = 0, kCheckValue };
+  CheckType type;
+  // kCheckAddress: offset (relative to the sequence start) the argument has to
+  // point to. kCheckValue: offset of the argument to compare against.
+  int check_offset;
+  // Byte length of the argument to compare against. Only meaningful for
+  // kCheckValue; 0 for kCheckAddress.
+  int check_length;
+
+  // Creates an address check.
+  BytecodeArgumentCheck(int offset, int length, int check_offset)
+      : BytecodeArgument(offset, length),
+        type(kCheckAddress),
+        check_offset(check_offset),
+        // Previously left uninitialized; copying the struct then read an
+        // indeterminate value.
+        check_length(0) {}
+  // Creates a value check.
+  BytecodeArgumentCheck(int offset, int length, int check_offset,
+                        int check_length)
+      : BytecodeArgument(offset, length),
+        type(kCheckValue),
+        check_offset(check_offset),
+        check_length(check_length) {}
+};
+
+// Trie-Node for storing bytecode sequences we want to optimize.
+class BytecodeSequenceNode {
+ public:
+ // Dummy bytecode used when we need to store/return a bytecode but it's not a
+ // valid bytecode in the current context.
+ static constexpr int kDummyBytecode = -1;
+
+ BytecodeSequenceNode(int bytecode, Zone* zone);
+ // Adds a new node as child of the current node if it isn't a child already.
+ BytecodeSequenceNode& FollowedBy(int bytecode);
+ // Marks the end of a sequence and sets optimized bytecode to replace all
+ // bytecodes of the sequence with.
+ BytecodeSequenceNode& ReplaceWith(int bytecode);
+ // Maps arguments of bytecodes in the sequence to the optimized bytecode.
+ // Order of invocation determines order of arguments in the optimized
+ // bytecode.
+ // Invoking this method is only allowed on nodes that mark the end of a valid
+ // sequence (i.e. after ReplaceWith()).
+ // bytecode_index_in_sequence: Zero-based index of the referred bytecode
+ // within the sequence (e.g. the bytecode passed to CreateSequence() has
+ // index 0).
+ // argument_offset: Zero-based offset to the argument within the bytecode
+ // (e.g. the first argument that's not packed with the bytecode has offset 4).
+ // argument_byte_length: Length of the argument.
+ // new_argument_byte_length: Length of the argument in the new bytecode
+ // (= argument_byte_length if omitted or passed as 0).
+ BytecodeSequenceNode& MapArgument(int bytecode_index_in_sequence,
+ int argument_offset,
+ int argument_byte_length,
+ int new_argument_byte_length = 0);
+ // Adds a check to the sequence node making it only a valid sequence when the
+ // argument of the current bytecode at the specified offset matches the offset
+ // to check against.
+ // argument_offset: Zero-based offset to the argument within the bytecode
+ // (e.g. the first argument that's not packed with the bytecode has offset 4).
+ // argument_byte_length: Length of the argument.
+ // check_byte_offset: Zero-based offset relative to the beginning of the
+ // sequence that needs to match the value given by argument_offset. (e.g.
+ // check_byte_offset 0 matches the address of the first bytecode in the
+ // sequence).
+ BytecodeSequenceNode& IfArgumentEqualsOffset(int argument_offset,
+ int argument_byte_length,
+ int check_byte_offset);
+ // Adds a check to the sequence node making it only a valid sequence when the
+ // argument of the current bytecode at the specified offset matches the
+ // argument of another bytecode in the sequence.
+ // This is similar to IfArgumentEqualsOffset, except that this method matches
+ // the values of both arguments.
+ BytecodeSequenceNode& IfArgumentEqualsValueAtOffset(
+ int argument_offset, int argument_byte_length,
+ int other_bytecode_index_in_sequence, int other_argument_offset,
+ int other_argument_byte_length);
+ // Marks an argument as unused.
+ // All arguments that are not mapped explicitly have to be marked as unused.
+ // bytecode_index_in_sequence: Zero-based index of the referred bytecode
+ // within the sequence (e.g. the bytecode passed to CreateSequence() has
+ // index 0).
+ // argument_offset: Zero-based offset to the argument within the bytecode
+ // (e.g. the first argument that's not packed with the bytecode has offset 4).
+ // argument_byte_length: Length of the argument.
+ BytecodeSequenceNode& IgnoreArgument(int bytecode_index_in_sequence,
+ int argument_offset,
+ int argument_byte_length);
+ // Checks if the current node is valid for the sequence. I.e. all conditions
+ // set by IfArgumentEqualsOffset and IfArgumentEqualsValueAtOffset are
+ // fulfilled by this node for the actual bytecode sequence starting at pc.
+ bool CheckArguments(const byte* bytecode, int pc);
+ // Returns whether this node marks the end of a valid sequence (i.e. can be
+ // replaced with an optimized bytecode).
+ bool IsSequence() const;
+ // Returns the length of the sequence in bytes.
+ int SequenceLength() const;
+ // Returns the optimized bytecode for the node or kDummyBytecode if it is not
+ // the end of a valid sequence.
+ int OptimizedBytecode() const;
+ // Returns the child of the current node matching the given bytecode or
+ // nullptr if no such child is found.
+ BytecodeSequenceNode* Find(int bytecode) const;
+ // Returns number of arguments mapped to the current node.
+ // Invoking this method is only allowed on nodes that mark the end of a valid
+ // sequence (i.e. if IsSequence())
+ size_t ArgumentSize() const;
+ // Returns the argument-mapping of the argument at index.
+ // Invoking this method is only allowed on nodes that mark the end of a valid
+ // sequence (i.e. if IsSequence())
+ BytecodeArgumentMapping ArgumentMapping(size_t index) const;
+ // Returns an iterator to the beginning of the ignored arguments.
+ // Invoking this method is only allowed on nodes that mark the end of a valid
+ // sequence (i.e. if IsSequence())
+ ZoneLinkedList<BytecodeArgument>::iterator ArgumentIgnoredBegin() const;
+ // Returns an iterator to the end of the ignored arguments.
+ // Invoking this method is only allowed on nodes that mark the end of a valid
+ // sequence (i.e. if IsSequence())
+ ZoneLinkedList<BytecodeArgument>::iterator ArgumentIgnoredEnd() const;
+ // Returns whether the list of ignored arguments exists (is non-null).
+ bool HasIgnoredArguments() const;
+
+ private:
+ // Returns a node in the sequence specified by its index within the sequence.
+ BytecodeSequenceNode& GetNodeByIndexInSequence(int index_in_sequence);
+ Zone* zone() const;
+
+ int bytecode_;  // Bytecode matched by this node (kDummyBytecode for the root).
+ int bytecode_replacement_;  // Optimized bytecode, or kDummyBytecode if none.
+ int index_in_sequence_;  // Zero-based index of this bytecode in its sequence.
+ int start_offset_;  // Byte offset of this bytecode from the sequence start.
+ BytecodeSequenceNode* parent_;  // Previous node in the sequence, or nullptr.
+ ZoneUnorderedMap<int, BytecodeSequenceNode*> children_;  // Keyed by bytecode.
+ ZoneVector<BytecodeArgumentMapping>* argument_mapping_;
+ ZoneLinkedList<BytecodeArgumentCheck>* argument_check_;
+ ZoneLinkedList<BytecodeArgument>* argument_ignored_;
+
+ Zone* zone_;
+};
+
+// This definition is here in order to please the linker, which in debug mode
+// sometimes requires static constants to be defined in .cc files.
+constexpr int BytecodeSequenceNode::kDummyBytecode;
+
+class RegExpBytecodePeephole {  // Rewrites known bytecode sequences into optimized bytecodes.
+ public:
+ RegExpBytecodePeephole(Zone* zone, size_t buffer_size,
+ const ZoneUnorderedMap<int, int>& jump_edges);
+
+ // Parses bytecode and fills the internal buffer with the potentially
+ // optimized bytecode. Returns true when optimizations were performed, false
+ // otherwise.
+ bool OptimizeBytecode(const byte* bytecode, int length);
+ // Copies the internal bytecode buffer to another buffer. The caller is
+ // responsible for allocating/freeing the memory.
+ void CopyOptimizedBytecode(byte* to_address) const;
+ int Length() const;  // Same value as pc().
+
+ private:
+ // Sets up all sequences that are going to be used.
+ void DefineStandardSequences();
+ // Starts a new bytecode sequence.
+ BytecodeSequenceNode& CreateSequence(int bytecode);
+ // Checks for optimization candidates at pc and emits optimized bytecode to
+ // the internal buffer. Returns the length of replaced bytecodes in bytes
+ // (0 if no sequence matched).
+ int TryOptimizeSequence(const byte* bytecode, int bytecode_length,
+ int start_pc);
+ // Emits optimized bytecode to the internal buffer. start_pc points to the
+ // start of the sequence in bytecode and last_node is the last
+ // BytecodeSequenceNode of the matching sequence found.
+ void EmitOptimization(int start_pc, const byte* bytecode,
+ const BytecodeSequenceNode& last_node);
+ // Adds a relative jump source fixup at pos.
+ // Jump source fixups are used to find offsets in the new bytecode that
+ // contain jump sources.
+ void AddJumpSourceFixup(int fixup, int pos);
+ // Adds a relative jump destination fixup at pos.
+ // Jump destination fixups are used to find offsets in the new bytecode that
+ // can be jumped to.
+ void AddJumpDestinationFixup(int fixup, int pos);
+ // Sets an absolute jump destination fixup at pos.
+ void SetJumpDestinationFixup(int fixup, int pos);
+ // Prepare internal structures used to fixup jumps.
+ void PrepareJumpStructures(const ZoneUnorderedMap<int, int>& jump_edges);
+ // Updates all jump targets in the new bytecode.
+ void FixJumps();
+ // Update a single jump.
+ void FixJump(int jump_source, int jump_destination);
+ void AddSentinelFixups(int pos);  // Called by the constructor for -1 and buffer_size.
+ template <typename T>
+ void EmitValue(T value);
+ template <typename T>
+ void OverwriteValue(int offset, T value);
+ void CopyRangeToOutput(const byte* orig_bytecode, int start, int length);
+ void SetRange(byte value, int count);
+ void EmitArgument(int start_pc, const byte* bytecode,
+ BytecodeArgumentMapping arg);
+ int pc() const;
+ Zone* zone() const;
+
+ ZoneVector<byte> optimized_bytecode_buffer_;
+ BytecodeSequenceNode* sequences_;  // Root of the trie of known sequences.
+ // Jumps used in old bytecode.
+ // Key: Jump source (offset where destination is stored in old bytecode)
+ // Value: Destination
+ ZoneMap<int, int> jump_edges_;
+ // Jumps used in new bytecode.
+ // Key: Jump source (offset where destination is stored in new bytecode)
+ // Value: Destination
+ ZoneMap<int, int> jump_edges_mapped_;
+ // Number of times a jump destination is used within the bytecode.
+ // Key: Jump destination (offset in old bytecode).
+ // Value: Number of times jump destination is used.
+ ZoneMap<int, int> jump_usage_counts_;
+ // Maps offsets in old bytecode to fixups of sources (delta to new bytecode).
+ // Key: Offset in old bytecode from where the fixup is valid.
+ // Value: Delta to map jump source from old bytecode to new bytecode in bytes.
+ ZoneMap<int, int> jump_source_fixups_;
+ // Maps offsets in old bytecode to fixups of destinations (delta to new
+ // bytecode).
+ // Key: Offset in old bytecode from where the fixup is valid.
+ // Value: Delta to map jump destinations from old bytecode to new bytecode in
+ // bytes.
+ ZoneMap<int, int> jump_destination_fixups_;
+
+ Zone* zone_;
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodePeephole);
+};
+
+// Reads a value of type T located at byte offset |pos| inside |buffer|.
+// The address is DCHECKed to be aligned for T.
+template <typename T>
+T GetValue(const byte* buffer, int pos) {
+  const byte* const address = buffer + pos;
+  DCHECK(IsAligned(reinterpret_cast<Address>(address), alignof(T)));
+  const T* const typed_address = reinterpret_cast<const T*>(address);
+  return *typed_address;
+}
+
+// Reads the argument of |length| bytes stored at |offset| in |bytecode| and
+// widens it to int32_t. A 1-byte argument is read as `byte`, a 2-byte
+// argument as int16_t and a 4-byte argument as int32_t; any other length is
+// unreachable.
+int32_t GetArgumentValue(const byte* bytecode, int offset, int length) {
+  if (length == 1) return GetValue<byte>(bytecode, offset);
+  if (length == 2) return GetValue<int16_t>(bytecode, offset);
+  if (length == 4) return GetValue<int32_t>(bytecode, offset);
+  UNREACHABLE();
+}
+
+// Creates a detached node for |bytecode|: no replacement, index and start
+// offset 0, no parent. All containers are allocated in |zone|.
+BytecodeSequenceNode::BytecodeSequenceNode(int bytecode, Zone* zone)
+    : bytecode_(bytecode),
+      bytecode_replacement_(kDummyBytecode),
+      index_in_sequence_(0),
+      start_offset_(0),
+      parent_(nullptr),
+      children_(zone),
+      argument_mapping_(zone->New<ZoneVector<BytecodeArgumentMapping>>(zone)),
+      argument_check_(zone->New<ZoneLinkedList<BytecodeArgumentCheck>>(zone)),
+      argument_ignored_(zone->New<ZoneLinkedList<BytecodeArgument>>(zone)),
+      zone_(zone) {}
+
+// Returns the child node for |bytecode|, creating it first if it does not
+// exist yet.
+BytecodeSequenceNode& BytecodeSequenceNode::FollowedBy(int bytecode) {
+  DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount);
+
+  auto existing_child = children_.find(bytecode);
+  if (existing_child != children_.end()) return *existing_child->second;
+
+  BytecodeSequenceNode* child =
+      zone()->New<BytecodeSequenceNode>(bytecode, zone());
+  // Children of the dummy node start a sequence and keep the defaults
+  // (index 0, offset 0, no parent). All other children continue this node's
+  // sequence.
+  const bool continues_sequence = bytecode_ != kDummyBytecode;
+  if (continues_sequence) {
+    child->parent_ = this;
+    child->index_in_sequence_ = index_in_sequence_ + 1;
+    child->start_offset_ = start_offset_ + RegExpBytecodeLength(bytecode_);
+  }
+  children_[bytecode] = child;
+
+  return *child;
+}
+
+// Records |bytecode| as the replacement for the sequence ending at this node,
+// which makes IsSequence() true. Returns this node for chaining.
+BytecodeSequenceNode& BytecodeSequenceNode::ReplaceWith(int bytecode) {
+  DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount);
+  bytecode_replacement_ = bytecode;
+  return *this;
+}
+
+// Appends a mapping for the argument at |argument_offset| of the bytecode
+// with index |bytecode_index_in_sequence|. The stored offset is relative to
+// the start of the sequence. Returns this node for chaining.
+BytecodeSequenceNode& BytecodeSequenceNode::MapArgument(
+    int bytecode_index_in_sequence, int argument_offset,
+    int argument_byte_length, int new_argument_byte_length) {
+  DCHECK(IsSequence());
+  DCHECK_LE(bytecode_index_in_sequence, index_in_sequence_);
+
+  BytecodeSequenceNode& source_node =
+      GetNodeByIndexInSequence(bytecode_index_in_sequence);
+  DCHECK_LT(argument_offset, RegExpBytecodeLength(source_node.bytecode_));
+
+  // A new length of 0 means "keep the original length".
+  const int mapped_length = new_argument_byte_length == 0
+                                ? argument_byte_length
+                                : new_argument_byte_length;
+  const int offset_in_sequence = source_node.start_offset_ + argument_offset;
+  argument_mapping_->push_back(BytecodeArgumentMapping(
+      offset_in_sequence, argument_byte_length, mapped_length));
+
+  return *this;
+}
+
+// Adds an address check (kCheckAddress) for the argument at |argument_offset|
+// of this node's bytecode against |check_byte_offset|. Returns this node for
+// chaining.
+BytecodeSequenceNode& BytecodeSequenceNode::IfArgumentEqualsOffset(
+    int argument_offset, int argument_byte_length, int check_byte_offset) {
+  DCHECK_LT(argument_offset, RegExpBytecodeLength(bytecode_));
+  DCHECK(argument_byte_length == 1 || argument_byte_length == 2 ||
+         argument_byte_length == 4);
+
+  // The three-argument constructor creates a kCheckAddress check.
+  const int offset_in_sequence = start_offset_ + argument_offset;
+  argument_check_->push_back(BytecodeArgumentCheck(
+      offset_in_sequence, argument_byte_length, check_byte_offset));
+
+  return *this;
+}
+
+// Adds a value check (kCheckValue): the argument at |argument_offset| of this
+// node's bytecode must equal the argument at |other_argument_offset| of the
+// bytecode with index |other_bytecode_index_in_sequence|. Returns this node
+// for chaining.
+BytecodeSequenceNode& BytecodeSequenceNode::IfArgumentEqualsValueAtOffset(
+    int argument_offset, int argument_byte_length,
+    int other_bytecode_index_in_sequence, int other_argument_offset,
+    int other_argument_byte_length) {
+  DCHECK_LT(argument_offset, RegExpBytecodeLength(bytecode_));
+  DCHECK_LE(other_bytecode_index_in_sequence, index_in_sequence_);
+  DCHECK_EQ(argument_byte_length, other_argument_byte_length);
+
+  BytecodeSequenceNode& other_node =
+      GetNodeByIndexInSequence(other_bytecode_index_in_sequence);
+  DCHECK_LT(other_argument_offset, RegExpBytecodeLength(other_node.bytecode_));
+
+  // Both offsets are stored relative to the start of the sequence.
+  const int own_offset = start_offset_ + argument_offset;
+  const int other_offset = other_node.start_offset_ + other_argument_offset;
+  // The four-argument constructor creates a kCheckValue check.
+  argument_check_->push_back(BytecodeArgumentCheck(
+      own_offset, argument_byte_length, other_offset,
+      other_argument_byte_length));
+
+  return *this;
+}
+
+// Records the argument at |argument_offset| of the bytecode with index
+// |bytecode_index_in_sequence| as ignored. The stored offset is relative to
+// the start of the sequence. Returns this node for chaining.
+BytecodeSequenceNode& BytecodeSequenceNode::IgnoreArgument(
+    int bytecode_index_in_sequence, int argument_offset,
+    int argument_byte_length) {
+  DCHECK(IsSequence());
+  DCHECK_LE(bytecode_index_in_sequence, index_in_sequence_);
+
+  BytecodeSequenceNode& source_node =
+      GetNodeByIndexInSequence(bytecode_index_in_sequence);
+  DCHECK_LT(argument_offset, RegExpBytecodeLength(source_node.bytecode_));
+
+  const int offset_in_sequence = source_node.start_offset_ + argument_offset;
+  argument_ignored_->push_back(
+      BytecodeArgument(offset_in_sequence, argument_byte_length));
+
+  return *this;
+}
+
+// Evaluates every check registered on this node against the bytecode
+// sequence starting at |pc|. Returns false as soon as one check fails and
+// true if all checks pass (or none are registered).
+bool BytecodeSequenceNode::CheckArguments(const byte* bytecode, int pc) {
+  for (const BytecodeArgumentCheck& check : *argument_check_) {
+    const int32_t actual =
+        GetArgumentValue(bytecode, pc + check.offset, check.length);
+    switch (check.type) {
+      case BytecodeArgumentCheck::kCheckAddress:
+        // The argument must hold the absolute address pc + check_offset.
+        if (actual != pc + check.check_offset) return false;
+        break;
+      case BytecodeArgumentCheck::kCheckValue: {
+        // The argument must equal another argument of the same sequence.
+        const int32_t expected = GetArgumentValue(
+            bytecode, pc + check.check_offset, check.check_length);
+        if (actual != expected) return false;
+        break;
+      }
+      default:
+        UNREACHABLE();
+    }
+  }
+  return true;
+}
+
+// True once a replacement bytecode has been set on this node.
+bool BytecodeSequenceNode::IsSequence() const {
+  const bool has_replacement = bytecode_replacement_ != kDummyBytecode;
+  return has_replacement;
+}
+
+// Byte length of the sequence up to and including this node's bytecode.
+int BytecodeSequenceNode::SequenceLength() const {
+  const int own_length = RegExpBytecodeLength(bytecode_);
+  return start_offset_ + own_length;
+}
+
+// Replacement bytecode of this node; kDummyBytecode if none has been set.
+int BytecodeSequenceNode::OptimizedBytecode() const {
+  const int replacement = bytecode_replacement_;
+  return replacement;
+}
+
+// Looks up the child node registered for |bytecode|; nullptr if there is none.
+BytecodeSequenceNode* BytecodeSequenceNode::Find(int bytecode) const {
+  const auto child = children_.find(bytecode);
+  return child != children_.end() ? child->second : nullptr;
+}
+
+// Number of argument mappings recorded on this (sequence-ending) node.
+size_t BytecodeSequenceNode::ArgumentSize() const {
+  DCHECK(IsSequence());
+  const size_t mapping_count = argument_mapping_->size();
+  return mapping_count;
+}
+
+// Returns a copy of the argument mapping stored at |index|.
+BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping(
+    size_t index) const {
+  DCHECK(IsSequence());
+  DCHECK(argument_mapping_ != nullptr);
+  DCHECK_LT(index, argument_mapping_->size());
+
+  const ZoneVector<BytecodeArgumentMapping>& mappings = *argument_mapping_;
+  return mappings.at(index);
+}
+
+// Iterator to the first ignored argument of this (sequence-ending) node.
+ZoneLinkedList<BytecodeArgument>::iterator
+BytecodeSequenceNode::ArgumentIgnoredBegin() const {
+  DCHECK(IsSequence());
+  DCHECK(argument_ignored_ != nullptr);
+  ZoneLinkedList<BytecodeArgument>& ignored = *argument_ignored_;
+  return ignored.begin();
+}
+
+// Past-the-end iterator of the ignored arguments of this (sequence-ending)
+// node.
+ZoneLinkedList<BytecodeArgument>::iterator
+BytecodeSequenceNode::ArgumentIgnoredEnd() const {
+  DCHECK(IsSequence());
+  DCHECK(argument_ignored_ != nullptr);
+  ZoneLinkedList<BytecodeArgument>& ignored = *argument_ignored_;
+  return ignored.end();
+}
+
+// Reports whether the ignored-argument list exists. Note that this tests the
+// list pointer, not whether the list contains any entries; the constructor
+// always allocates the list.
+bool BytecodeSequenceNode::HasIgnoredArguments() const {
+  const bool list_exists = argument_ignored_ != nullptr;
+  return list_exists;
+}
+
+// Walks up the parent chain until it reaches the node whose index in the
+// sequence equals |index_in_sequence| and returns that node. The index must
+// not be larger than this node's own index.
+BytecodeSequenceNode& BytecodeSequenceNode::GetNodeByIndexInSequence(
+    int index_in_sequence) {
+  DCHECK_LE(index_in_sequence, index_in_sequence_);
+
+  BytecodeSequenceNode* node = this;
+  while (index_in_sequence < node->index_in_sequence_) {
+    DCHECK(node->parent_ != nullptr);
+    node = node->parent_;
+  }
+  return *node;
+}
+
+// Zone this node allocates its children from.
+Zone* BytecodeSequenceNode::zone() const {
+  return zone_;
+}
+
+// Sets up the optimizer: reserves |buffer_size| bytes for the output, imports
+// |jump_edges| of the old bytecode, registers the known sequences and installs
+// sentinel fixups around the old bytecode range.
+RegExpBytecodePeephole::RegExpBytecodePeephole(
+    Zone* zone, size_t buffer_size,
+    const ZoneUnorderedMap<int, int>& jump_edges)
+    : optimized_bytecode_buffer_(zone),
+      sequences_(zone->New<BytecodeSequenceNode>(
+          BytecodeSequenceNode::kDummyBytecode, zone)),
+      jump_edges_(zone),
+      jump_edges_mapped_(zone),
+      jump_usage_counts_(zone),
+      jump_source_fixups_(zone),
+      jump_destination_fixups_(zone),
+      zone_(zone) {
+  optimized_bytecode_buffer_.reserve(buffer_size);
+  PrepareJumpStructures(jump_edges);
+  DefineStandardSequences();
+
+  // Fixups are deltas that translate offsets of jump sources/destinations in
+  // the old bytecode to the new bytecode. All jump targets are fixed after
+  // the new bytecode is fully emitted in the internal buffer.
+  // A sentinel before the bytecode (position -1) and one at the end of the
+  // old bytecode mean the fixup loops never have to check for the end of the
+  // iterator.
+  AddSentinelFixups(-1);
+  DCHECK_LE(buffer_size, std::numeric_limits<int>::max());
+  const int end_of_old_bytecode = static_cast<int>(buffer_size);
+  AddSentinelFixups(end_of_old_bytecode);
+}
+
+void RegExpBytecodePeephole::DefineStandardSequences() {  // Registers every sequence this optimizer replaces.
+ // Commonly used sequences can be found by creating regexp bytecode traces
+ // (--trace-regexp-bytecodes) and using v8/tools/regexp-sequences.py.
+ CreateSequence(BC_LOAD_CURRENT_CHAR)
+ .FollowedBy(BC_CHECK_BIT_IN_TABLE)
+ .FollowedBy(BC_ADVANCE_CP_AND_GOTO)
+ // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the
+ // first bytecode in this sequence.
+ .IfArgumentEqualsOffset(4, 4, 0)
+ .ReplaceWith(BC_SKIP_UNTIL_BIT_IN_TABLE)
+ .MapArgument(0, 1, 3) // load offset
+ .MapArgument(2, 1, 3, 4) // advance by
+ .MapArgument(1, 8, 16) // bit table
+ .MapArgument(1, 4, 4) // goto when match
+ .MapArgument(0, 4, 4) // goto on failure
+ .IgnoreArgument(2, 4, 4); // loop jump
+
+ CreateSequence(BC_CHECK_CURRENT_POSITION)
+ .FollowedBy(BC_LOAD_CURRENT_CHAR_UNCHECKED)
+ .FollowedBy(BC_CHECK_CHAR)
+ .FollowedBy(BC_ADVANCE_CP_AND_GOTO)
+ // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the
+ // first bytecode in this sequence.
+ .IfArgumentEqualsOffset(4, 4, 0)
+ .ReplaceWith(BC_SKIP_UNTIL_CHAR_POS_CHECKED)
+ .MapArgument(1, 1, 3) // load offset
+ .MapArgument(3, 1, 3, 2) // advance_by
+ .MapArgument(2, 1, 3, 2) // c
+ .MapArgument(0, 1, 3, 4) // eats at least
+ .MapArgument(2, 4, 4) // goto when match
+ .MapArgument(0, 4, 4) // goto on failure
+ .IgnoreArgument(3, 4, 4); // loop jump
+
+ CreateSequence(BC_CHECK_CURRENT_POSITION)
+ .FollowedBy(BC_LOAD_CURRENT_CHAR_UNCHECKED)
+ .FollowedBy(BC_AND_CHECK_CHAR)
+ .FollowedBy(BC_ADVANCE_CP_AND_GOTO)
+ // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the
+ // first bytecode in this sequence.
+ .IfArgumentEqualsOffset(4, 4, 0)
+ .ReplaceWith(BC_SKIP_UNTIL_CHAR_AND)
+ .MapArgument(1, 1, 3) // load offset
+ .MapArgument(3, 1, 3, 2) // advance_by
+ .MapArgument(2, 1, 3, 2) // c
+ .MapArgument(2, 4, 4) // mask
+ .MapArgument(0, 1, 3, 4) // eats at least
+ .MapArgument(2, 8, 4) // goto when match
+ .MapArgument(0, 4, 4) // goto on failure
+ .IgnoreArgument(3, 4, 4); // loop jump
+
+ // TODO(pthier): It might make sense for short sequences like this one to only
+ // optimize them if the resulting optimization is not longer than the current
+ // one. This could be the case if there are jumps inside the sequence and we
+ // have to replicate parts of the sequence. A method to mark such sequences
+ // might be useful.
+ CreateSequence(BC_LOAD_CURRENT_CHAR)
+ .FollowedBy(BC_CHECK_CHAR)
+ .FollowedBy(BC_ADVANCE_CP_AND_GOTO)
+ // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the
+ // first bytecode in this sequence.
+ .IfArgumentEqualsOffset(4, 4, 0)
+ .ReplaceWith(BC_SKIP_UNTIL_CHAR)
+ .MapArgument(0, 1, 3) // load offset
+ .MapArgument(2, 1, 3, 2) // advance by
+ .MapArgument(1, 1, 3, 2) // character
+ .MapArgument(1, 4, 4) // goto when match
+ .MapArgument(0, 4, 4) // goto on failure
+ .IgnoreArgument(2, 4, 4); // loop jump
+
+ CreateSequence(BC_LOAD_CURRENT_CHAR)
+ .FollowedBy(BC_CHECK_CHAR)
+ .FollowedBy(BC_CHECK_CHAR)
+ // Sequence is only valid if the jump targets of both CHECK_CHAR bytecodes
+ // are equal.
+ .IfArgumentEqualsValueAtOffset(4, 4, 1, 4, 4)
+ .FollowedBy(BC_ADVANCE_CP_AND_GOTO)
+ // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the
+ // first bytecode in this sequence.
+ .IfArgumentEqualsOffset(4, 4, 0)
+ .ReplaceWith(BC_SKIP_UNTIL_CHAR_OR_CHAR)
+ .MapArgument(0, 1, 3) // load offset
+ .MapArgument(3, 1, 3, 4) // advance by
+ .MapArgument(1, 1, 3, 2) // character 1
+ .MapArgument(2, 1, 3, 2) // character 2
+ .MapArgument(1, 4, 4) // goto when match
+ .MapArgument(0, 4, 4) // goto on failure
+ .IgnoreArgument(2, 4, 4) // goto when match 2
+ .IgnoreArgument(3, 4, 4); // loop jump
+
+ CreateSequence(BC_LOAD_CURRENT_CHAR)
+ .FollowedBy(BC_CHECK_GT)
+ // Sequence is only valid if the jump target of CHECK_GT is the first
+ // bytecode AFTER the whole sequence.
+ .IfArgumentEqualsOffset(4, 4, 56)
+ .FollowedBy(BC_CHECK_BIT_IN_TABLE)
+ // Sequence is only valid if the jump target of CHECK_BIT_IN_TABLE is
+ // the ADVANCE_CP_AND_GOTO bytecode at the end of the sequence.
+ .IfArgumentEqualsOffset(4, 4, 48)
+ .FollowedBy(BC_GOTO)
+ // Sequence is only valid if the jump target of GOTO is the same as the
+ // jump target of CHECK_GT (i.e. both jump to the first bytecode AFTER the
+ // whole sequence).
+ .IfArgumentEqualsValueAtOffset(4, 4, 1, 4, 4)
+ .FollowedBy(BC_ADVANCE_CP_AND_GOTO)
+ // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the
+ // first bytecode in this sequence.
+ .IfArgumentEqualsOffset(4, 4, 0)
+ .ReplaceWith(BC_SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE)
+ .MapArgument(0, 1, 3) // load offset
+ .MapArgument(4, 1, 3, 2) // advance by
+ .MapArgument(1, 1, 3, 2) // character
+ .MapArgument(2, 8, 16) // bit table
+ .MapArgument(1, 4, 4) // goto when match
+ .MapArgument(0, 4, 4) // goto on failure
+ .IgnoreArgument(2, 4, 4) // indirect loop jump
+ .IgnoreArgument(3, 4, 4) // jump out of loop
+ .IgnoreArgument(4, 4, 4); // loop jump
+}
+
+// Walks over |bytecode| (|length| bytes). Wherever a known sequence matches,
+// its optimized replacement is emitted; every other bytecode is copied to the
+// output unchanged. Jumps are fixed up if anything was replaced.
+// Returns true if at least one sequence was replaced.
+bool RegExpBytecodePeephole::OptimizeBytecode(const byte* bytecode,
+                                              int length) {
+  bool did_optimize = false;
+
+  for (int old_pc = 0; old_pc < length;) {
+    const int replaced_len = TryOptimizeSequence(bytecode, length, old_pc);
+    if (replaced_len <= 0) {
+      // No sequence starts here: copy this single bytecode verbatim.
+      const int bc = bytecode[old_pc];
+      const int bc_len = RegExpBytecodeLength(bc);
+      CopyRangeToOutput(bytecode, old_pc, bc_len);
+      old_pc += bc_len;
+      continue;
+    }
+    did_optimize = true;
+    old_pc += replaced_len;
+  }
+
+  if (did_optimize) FixJumps();
+
+  return did_optimize;
+}
+
+// Copies Length() bytes of the internal (optimized) bytecode buffer to
+// |to_address|. The caller owns the destination memory.
+void RegExpBytecodePeephole::CopyOptimizedBytecode(byte* to_address) const {
+  // data() is valid even for an empty buffer, whereas dereferencing begin()
+  // of an empty vector is undefined behaviour.
+  MemCopy(to_address, optimized_bytecode_buffer_.data(), Length());
+}
+
+// Length of the emitted bytecode; same value as pc().
+int RegExpBytecodePeephole::Length() const {
+  return pc();
+}
+
+// Starts (or continues defining) a sequence that begins with |bytecode| by
+// adding it as a child of the trie root. Returns the node for chaining.
+BytecodeSequenceNode& RegExpBytecodePeephole::CreateSequence(int bytecode) {
+  DCHECK(sequences_ != nullptr);
+  DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount);
+
+  BytecodeSequenceNode& first_node = sequences_->FollowedBy(bytecode);
+  return first_node;
+}
+
+// Tries to match one of the known sequences at |start_pc|. The longest valid
+// match wins; its replacement is emitted to the internal buffer.
+// Returns the byte length of the replaced old sequence, or 0 if nothing
+// matched.
+int RegExpBytecodePeephole::TryOptimizeSequence(const byte* bytecode,
+                                                int bytecode_length,
+                                                int start_pc) {
+  BytecodeSequenceNode* longest_match = nullptr;
+  BytecodeSequenceNode* node = sequences_;
+
+  // Descend the trie one bytecode at a time, remembering the deepest node
+  // that ends a valid sequence.
+  for (int current_pc = start_pc; current_pc < bytecode_length;
+       current_pc += RegExpBytecodeLength(bytecode[current_pc])) {
+    node = node->Find(bytecode[current_pc]);
+    if (node == nullptr) break;
+    if (!node->CheckArguments(bytecode, start_pc)) break;
+
+    if (node->IsSequence()) longest_match = node;
+  }
+
+  if (longest_match == nullptr) return 0;
+
+  EmitOptimization(start_pc, bytecode, *longest_match);
+  return longest_match->SequenceLength();
+}
+
+// Emits the optimized bytecode that replaces the matched sequence and records
+// the jump bookkeeping that FixJumps() needs later.
+// start_pc: offset in the old bytecode where the matched sequence starts.
+// bytecode: the old bytecode.
+// last_node: last node of the matched sequence; supplies the replacement
+//   bytecode, the argument mappings and the ignored arguments.
+// If the old sequence contains a destination that is still jumped to, the
+// bytecodes from that destination to the end of the sequence are copied after
+// the optimized bytecode so those jumps stay valid.
+void RegExpBytecodePeephole::EmitOptimization(
+    int start_pc, const byte* bytecode, const BytecodeSequenceNode& last_node) {
+#ifdef DEBUG
+  int optimized_start_pc = pc();
+#endif
+  // Jump sources that are mapped or marked as unused will be deleted at the end
+  // of this method. We don't delete them immediately as we might need the
+  // information when we have to preserve bytecodes at the end.
+  // TODO(pthier): Replace with a stack-allocated data structure.
+  ZoneLinkedList<int> delete_jumps = ZoneLinkedList<int>(zone());
+
+  // Marks the old jump edge for deletion and decrements the usage count of
+  // its destination.
+  auto retire_jump_edge = [&](int jump_source, int jump_destination) {
+    delete_jumps.push_back(jump_source);
+    auto jump_count_iter = jump_usage_counts_.find(jump_destination);
+    DCHECK(jump_count_iter != jump_usage_counts_.end());
+    --jump_count_iter->second;
+  };
+
+  uint32_t bc = last_node.OptimizedBytecode();
+  EmitValue(bc);
+
+  for (size_t arg = 0; arg < last_node.ArgumentSize(); arg++) {
+    BytecodeArgumentMapping arg_map = last_node.ArgumentMapping(arg);
+    int arg_pos = start_pc + arg_map.offset;
+    // If we map any jump source we mark the old source for deletion and insert
+    // a new jump.
+    auto jump_edge_iter = jump_edges_.find(arg_pos);
+    if (jump_edge_iter != jump_edges_.end()) {
+      int jump_source = jump_edge_iter->first;
+      int jump_destination = jump_edge_iter->second;
+      // Add new jump edge at current position.
+      jump_edges_mapped_.emplace(Length(), jump_destination);
+      retire_jump_edge(jump_source, jump_destination);
+    }
+    // TODO(pthier): DCHECK that mapped arguments are never sources of jumps
+    // to destinations inside the sequence.
+    EmitArgument(start_pc, bytecode, arg_map);
+  }
+  DCHECK_EQ(pc(), optimized_start_pc +
+                      RegExpBytecodeLength(last_node.OptimizedBytecode()));
+
+  // Remove jumps from arguments we ignore.
+  if (last_node.HasIgnoredArguments()) {
+    for (auto ignored_arg = last_node.ArgumentIgnoredBegin();
+         ignored_arg != last_node.ArgumentIgnoredEnd(); ignored_arg++) {
+      auto jump_edge_iter = jump_edges_.find(start_pc + ignored_arg->offset);
+      if (jump_edge_iter != jump_edges_.end()) {
+        retire_jump_edge(jump_edge_iter->first, jump_edge_iter->second);
+      }
+    }
+  }
+
+  // First offset in the old bytecode after the matched sequence.
+  const int sequence_end = start_pc + last_node.SequenceLength();
+  int fixup_length = RegExpBytecodeLength(bc) - last_node.SequenceLength();
+
+  // Check if there are any jumps inside the old sequence.
+  // If so we have to keep the bytecodes that are jumped to around.
+  // Jump destinations only jumped to from inside the sequence (usage count 0)
+  // will be ignored. The iterator is only dereferenced after it has been
+  // compared against end(): there may be no jump destination after start_pc.
+  auto jump_destination_candidate = jump_usage_counts_.upper_bound(start_pc);
+  while (jump_destination_candidate != jump_usage_counts_.end() &&
+         jump_destination_candidate->second == 0) {
+    ++jump_destination_candidate;
+  }
+
+  int preserve_from = sequence_end;
+  if (jump_destination_candidate != jump_usage_counts_.end() &&
+      jump_destination_candidate->first < sequence_end) {
+    preserve_from = jump_destination_candidate->first;
+    // Check if any jump in the sequence we are preserving has a jump
+    // destination inside the optimized sequence before the current position we
+    // want to preserve. If so we have to preserve all bytecodes starting at
+    // this jump destination.
+    for (auto jump_iter = jump_edges_.lower_bound(preserve_from);
+         jump_iter != jump_edges_.end() &&
+         jump_iter->first /* jump source */ < sequence_end;
+         ++jump_iter) {
+      int jump_destination = jump_iter->second;
+      if (jump_destination > start_pc && jump_destination < preserve_from) {
+        preserve_from = jump_destination;
+      }
+    }
+
+    // We preserve everything to the end of the sequence. This is conservative
+    // since it would be enough to preserve all bytecodes up to an unconditional
+    // jump.
+    int preserve_length = sequence_end - preserve_from;
+    fixup_length += preserve_length;
+    // Jumps after the start of the preserved sequence need fixup.
+    AddJumpSourceFixup(fixup_length, sequence_end - preserve_length);
+    // All jump targets after the start of the optimized sequence need to be
+    // fixed relative to the length of the optimized sequence including
+    // bytecodes we preserved.
+    AddJumpDestinationFixup(fixup_length, start_pc + 1);
+    // Jumps to the sequence we preserved need absolute fixup as they could
+    // occur before or after the sequence.
+    SetJumpDestinationFixup(pc() - preserve_from, preserve_from);
+    CopyRangeToOutput(bytecode, preserve_from, preserve_length);
+  } else {
+    AddJumpDestinationFixup(fixup_length, start_pc + 1);
+    // Jumps after the end of the old sequence need fixup.
+    AddJumpSourceFixup(fixup_length, sequence_end);
+  }
+
+  // Delete jumps we definitely don't need anymore. Jump sources inside the
+  // preserved range are kept because the preserved bytecodes still use them.
+  for (int del : delete_jumps) {
+    if (del < preserve_from) {
+      jump_edges_.erase(del);
+    }
+  }
+}
+
+// Sets the jump source fixup at |pos| to the fixup of the closest entry
+// before |pos| plus |fixup|.
+void RegExpBytecodePeephole::AddJumpSourceFixup(int fixup, int pos) {
+  auto fixup_iter = jump_source_fixups_.lower_bound(pos);
+  DCHECK(fixup_iter != jump_source_fixups_.end());
+  DCHECK(fixup_iter != jump_source_fixups_.begin());
+
+  // Step back to the entry that precedes |pos|.
+  --fixup_iter;
+  const int accumulated_fixup = fixup_iter->second + fixup;
+  jump_source_fixups_[pos] = accumulated_fixup;
+}
+
+// Sets the jump destination fixup at |pos| to the fixup of the closest entry
+// before |pos| plus |fixup|.
+void RegExpBytecodePeephole::AddJumpDestinationFixup(int fixup, int pos) {
+  auto fixup_iter = jump_destination_fixups_.lower_bound(pos);
+  DCHECK(fixup_iter != jump_destination_fixups_.end());
+  DCHECK(fixup_iter != jump_destination_fixups_.begin());
+
+  // Step back to the entry that precedes |pos|.
+  --fixup_iter;
+  const int accumulated_fixup = fixup_iter->second + fixup;
+  jump_destination_fixups_[pos] = accumulated_fixup;
+}
+
+// Installs |fixup| as the destination fixup for exactly position |pos|:
+// an entry with |fixup| is emplaced at |pos|, and the value that was in
+// effect before |pos| is emplaced again at |pos| + 1.
+void RegExpBytecodePeephole::SetJumpDestinationFixup(int fixup, int pos) {
+  auto neighbour = jump_destination_fixups_.lower_bound(pos);
+  // There has to be an entry at or after |pos| as well as one before it.
+  DCHECK(neighbour != jump_destination_fixups_.end());
+  DCHECK(neighbour != jump_destination_fixups_.begin());
+
+  --neighbour;
+  const int fixup_before_pos = neighbour->second;
+  jump_destination_fixups_.emplace(pos, fixup);
+  jump_destination_fixups_.emplace(pos + 1, fixup_before_pos);
+}
+
+// Copies every (jump source -> jump destination) pair of |jump_edges| into
+// |jump_edges_| and bumps the usage counter of each destination.
+void RegExpBytecodePeephole::PrepareJumpStructures(
+    const ZoneUnorderedMap<int, int>& jump_edges) {
+  for (const auto& edge : jump_edges) {
+    const int source = edge.first;
+    const int destination = edge.second;
+
+    jump_edges_.emplace(source, destination);
+    ++jump_usage_counts_[destination];
+  }
+}
+
+// Patches all recorded jumps. Sources of edges in |jump_edges_| are first
+// shifted by the source fixup in effect at their position; edges in
+// |jump_edges_mapped_| are passed to FixJump() unchanged.
+void RegExpBytecodePeephole::FixJumps() {
+  // Fixup currently applied to jump sources.
+  int source_delta = 0;
+  // Next entry at which the source fixup changes.
+  auto upcoming_fixup = jump_source_fixups_.lower_bound(0);
+
+  for (const auto& edge : jump_edges_) {
+    // Consume every fixup entry located at or before this jump source.
+    while (edge.first >= upcoming_fixup->first) {
+      source_delta = upcoming_fixup->second;
+      ++upcoming_fixup;
+    }
+
+    FixJump(edge.first + source_delta, edge.second);
+  }
+
+  // Mapped jump edges don't need source fixups, as the position already is an
+  // offset in the new bytecode.
+  for (const auto& mapped_edge : jump_edges_mapped_) {
+    FixJump(mapped_edge.first, mapped_edge.second);
+  }
+}
+
+// Rewrites the 32-bit jump operand stored at offset |jump_source| of the
+// optimized buffer so that it refers to the relocated |jump_destination|.
+// Nothing is written when the destination did not move.
+void RegExpBytecodePeephole::FixJump(int jump_source, int jump_destination) {
+  // The fixup in effect is the last entry whose key is <= |jump_destination|.
+  auto fixup_entry = jump_destination_fixups_.upper_bound(jump_destination);
+  --fixup_entry;
+  const int relocated_destination = jump_destination + fixup_entry->second;
+  DCHECK_LT(relocated_destination, Length());
+#ifdef DEBUG
+  // TODO(pthier): This check could be better if we track the bytecodes
+  // actually used and check if we jump to one of them.
+  byte target_bc = optimized_bytecode_buffer_[relocated_destination];
+  DCHECK_GT(target_bc, 0);
+  DCHECK_LT(target_bc, kRegExpBytecodeCount);
+#endif
+
+  if (relocated_destination == jump_destination) return;
+  OverwriteValue<uint32_t>(jump_source, relocated_destination);
+}
+
+// Seeds both fixup maps (source and destination) with a zero-valued entry
+// at |pos|.
+void RegExpBytecodePeephole::AddSentinelFixups(int pos) {
+  constexpr int kNoFixup = 0;
+  jump_source_fixups_.emplace(pos, kNoFixup);
+  jump_destination_fixups_.emplace(pos, kNoFixup);
+}
+
+// Appends the sizeof(T) bytes that make up |value| (in host memory order) to
+// the end of the optimized bytecode buffer.
+template <typename T>
+void RegExpBytecodePeephole::EmitValue(T value) {
+  DCHECK(optimized_bytecode_buffer_.begin() + pc() ==
+         optimized_bytecode_buffer_.end());
+  byte* const first_byte = reinterpret_cast<byte*>(&value);
+  byte* const past_last_byte = first_byte + sizeof(T);
+  optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(),
+                                    first_byte, past_last_byte);
+}
+
+// Overwrites sizeof(T) bytes of the optimized bytecode buffer, starting at
+// |offset|, with the bytes that make up |value| (in host memory order).
+template <typename T>
+void RegExpBytecodePeephole::OverwriteValue(int offset, T value) {
+  const byte* const raw = reinterpret_cast<byte*>(&value);
+  const int byte_count = static_cast<int>(sizeof(T));
+  for (int i = 0; i < byte_count; ++i) {
+    optimized_bytecode_buffer_[offset + i] = raw[i];
+  }
+}
+
+// Appends |length| bytes of |orig_bytecode|, beginning at offset |start|, to
+// the end of the optimized bytecode buffer.
+void RegExpBytecodePeephole::CopyRangeToOutput(const byte* orig_bytecode,
+                                               int start, int length) {
+  DCHECK(optimized_bytecode_buffer_.begin() + pc() ==
+         optimized_bytecode_buffer_.end());
+  const byte* const range_begin = orig_bytecode + start;
+  const byte* const range_end = range_begin + length;
+  optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(),
+                                    range_begin, range_end);
+}
+
+// Appends |count| copies of |value| to the end of the optimized bytecode
+// buffer.
+void RegExpBytecodePeephole::SetRange(byte value, int count) {
+  DCHECK(optimized_bytecode_buffer_.begin() + pc() ==
+         optimized_bytecode_buffer_.end());
+  auto append_position = optimized_bytecode_buffer_.end();
+  optimized_bytecode_buffer_.insert(append_position, count, value);
+}
+
+// Emits one argument of a matched bytecode sequence into the optimized
+// buffer. The argument is read from |bytecode| at |start_pc| + |arg.offset|
+// using |arg.length| bytes and written using |arg.new_length| bytes.
+// For lengths 1, 2, 4 and 8 the old and new length must be equal.
+void RegExpBytecodePeephole::EmitArgument(int start_pc, const byte* bytecode,
+ BytecodeArgumentMapping arg) {
+ int arg_pos = start_pc + arg.offset;
+ switch (arg.length) {
+ case 1:
+ DCHECK_EQ(arg.new_length, arg.length);
+ EmitValue(GetValue<byte>(bytecode, arg_pos));
+ break;
+ case 2:
+ DCHECK_EQ(arg.new_length, arg.length);
+ EmitValue(GetValue<uint16_t>(bytecode, arg_pos));
+ break;
+ case 3: {
+ // Length 3 only occurs in 'packed' arguments where the lowermost byte is
+ // the current bytecode, and the remaining 3 bytes are the packed value.
+ //
+ // We load 4 bytes from position - 1 and shift out the bytecode.
+#ifdef V8_TARGET_BIG_ENDIAN
+ UNIMPLEMENTED();
+ int32_t val = 0;
+#else
+ int32_t val = GetValue<int32_t>(bytecode, arg_pos - 1) >> kBitsPerByte;
+#endif // V8_TARGET_BIG_ENDIAN
+
+ // The unpacked value is re-emitted as 2, 3 (re-packed) or 4 bytes. Any
+ // other new length emits nothing.
+ switch (arg.new_length) {
+ case 2:
+ EmitValue<uint16_t>(val);
+ break;
+ case 3: {
+ // Pack with previously emitted value.
+ auto prev_val =
+ GetValue<int32_t>(&(*optimized_bytecode_buffer_.begin()),
+ Length() - sizeof(uint32_t));
+#ifdef V8_TARGET_BIG_ENDIAN
+ UNIMPLEMENTED();
+ USE(prev_val);
+#else
+ // The upper 24 bits of the last emitted word must still be free; they
+ // receive |val| while the low byte is kept.
+ DCHECK_EQ(prev_val & 0xFFFFFF00, 0);
+ OverwriteValue<uint32_t>(
+ pc() - sizeof(uint32_t),
+ (static_cast<uint32_t>(val) << 8) | (prev_val & 0xFF));
+#endif // V8_TARGET_BIG_ENDIAN
+ break;
+ }
+ case 4:
+ EmitValue<uint32_t>(val);
+ break;
+ }
+ break;
+ }
+ case 4:
+ DCHECK_EQ(arg.new_length, arg.length);
+ EmitValue(GetValue<uint32_t>(bytecode, arg_pos));
+ break;
+ case 8:
+ DCHECK_EQ(arg.new_length, arg.length);
+ EmitValue(GetValue<uint64_t>(bytecode, arg_pos));
+ break;
+ default:
+ // Any other length: copy min(length, new_length) bytes verbatim and pad
+ // with zero bytes if the new argument is longer than the old one.
+ CopyRangeToOutput(bytecode, arg_pos,
+ std::min(arg.length, arg.new_length));
+ if (arg.length < arg.new_length) {
+ SetRange(0x00, arg.new_length - arg.length);
+ }
+ break;
+ }
+}
+
+// Returns the current write position, i.e. the number of bytes emitted into
+// the optimized bytecode buffer so far.
+int RegExpBytecodePeephole::pc() const {
+  const auto emitted_bytes = optimized_bytecode_buffer_.size();
+  DCHECK_LE(emitted_bytes, std::numeric_limits<int>::max());
+  return static_cast<int>(emitted_bytes);
+}
+
+// Returns the zone stored in |zone_|.
+Zone* RegExpBytecodePeephole::zone() const {
+  return zone_;
+}
+
+} // namespace
+
+// static
+// Runs the peephole optimizer over |bytecode| (|length| bytes) and returns
+// the result in a freshly allocated ByteArray. If something was optimized and
+// the trace flag is set, the original and optimized bytecode are printed.
+Handle<ByteArray> RegExpBytecodePeepholeOptimization::OptimizeBytecode(
+    Isolate* isolate, Zone* zone, Handle<String> source, const byte* bytecode,
+    int length, const ZoneUnorderedMap<int, int>& jump_edges) {
+  RegExpBytecodePeephole peephole(zone, length, jump_edges);
+  const bool changed = peephole.OptimizeBytecode(bytecode, length);
+  Handle<ByteArray> optimized =
+      isolate->factory()->NewByteArray(peephole.Length());
+  peephole.CopyOptimizedBytecode(optimized->GetDataStartAddress());
+
+  // Tracing is the only remaining work; skip it unless requested and useful.
+  if (!changed || !v8_flags.trace_regexp_peephole_optimization) {
+    return optimized;
+  }
+
+  PrintF("Original Bytecode:\n");
+  RegExpBytecodeDisassemble(bytecode, length, source->ToCString().get());
+  PrintF("Optimized Bytecode:\n");
+  RegExpBytecodeDisassemble(optimized->GetDataStartAddress(),
+                            peephole.Length(), source->ToCString().get());
+  return optimized;
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.h b/js/src/irregexp/imported/regexp-bytecode-peephole.h
new file mode 100644
index 0000000000..5b8a0c7b4b
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecode-peephole.h
@@ -0,0 +1,30 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
+#define V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+class ByteArray;
+
+// Peephole optimization for regexp interpreter bytecode.
+// Pre-defined bytecode sequences occurring in the bytecode generated by the
+// RegExpBytecodeGenerator can be optimized into a single bytecode.
+class RegExpBytecodePeepholeOptimization : public AllStatic {
+ public:
+ // Performs peephole optimization on the given bytecode and returns the
+ // optimized bytecode.
+ // |bytecode| points to |length| bytes of input bytecode. |jump_edges| maps
+ // the position of each jump to its destination. |source| is the regexp
+ // pattern; it is only used to label the trace output.
+ static Handle<ByteArray> OptimizeBytecode(
+ Isolate* isolate, Zone* zone, Handle<String> source, const byte* bytecode,
+ int length, const ZoneUnorderedMap<int, int>& jump_edges);
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
diff --git a/js/src/irregexp/imported/regexp-bytecodes.cc b/js/src/irregexp/imported/regexp-bytecodes.cc
new file mode 100644
index 0000000000..829bea9180
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecodes.cc
@@ -0,0 +1,46 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-bytecodes.h"
+
+#include <cctype>
+
+
+namespace v8 {
+namespace internal {
+
+// Prints the single bytecode located at |pc|: its name, then every byte of
+// the instruction in hex, then the argument bytes as printable characters
+// ('.' for anything non-printable). |code_base| is not used.
+void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc) {
+  const int opcode = *reinterpret_cast<const int32_t*>(pc) & BYTECODE_MASK;
+  PrintF("%s", RegExpBytecodeName(opcode));
+
+  // Args and the bytecode as hex.
+  const int instruction_length = RegExpBytecodeLength(opcode);
+  for (int index = 0; index < instruction_length; ++index) {
+    PrintF(", %02x", pc[index]);
+  }
+  PrintF(" ");
+
+  // Args as ascii.
+  for (int index = 1; index < instruction_length; ++index) {
+    const unsigned char arg_byte = pc[index];
+    if (std::isprint(arg_byte)) {
+      PrintF("%c", arg_byte);
+    } else {
+      PrintF("%c", '.');
+    }
+  }
+  PrintF("\n");
+}
+
+// Prints a header naming |pattern|, then one line per bytecode found in the
+// |length| bytes starting at |code_base|: address, offset and disassembly.
+void RegExpBytecodeDisassemble(const byte* code_base, int length,
+                               const char* pattern) {
+  PrintF("[generated bytecode for regexp pattern: '%s']\n", pattern);
+
+  // Step from instruction to instruction using the length of each bytecode.
+  for (ptrdiff_t position = 0; position < length;
+       position += RegExpBytecodeLength(code_base[position])) {
+    const byte* const instruction = code_base + position;
+    PrintF("%p %4" V8PRIxPTRDIFF " ", instruction, position);
+    RegExpBytecodeDisassembleSingle(code_base, instruction);
+  }
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-bytecodes.h b/js/src/irregexp/imported/regexp-bytecodes.h
new file mode 100644
index 0000000000..5602d8d7bc
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-bytecodes.h
@@ -0,0 +1,257 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_BYTECODES_H_
+#define V8_REGEXP_REGEXP_BYTECODES_H_
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+// Maximum number of bytecodes that will be used (next power of 2 of actually
+// defined bytecodes).
+// All slots between the last actually defined bytecode and maximum id will be
+// filled with BREAKs, indicating an invalid operation. This way using
+// BYTECODE_MASK guarantees no OOB access to the dispatch table.
+constexpr int kRegExpPaddedBytecodeCount = 1 << 6;
+constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
+// The first argument is packed into the same word as the bytecode, so it has
+// 24 bits. Since it can be positive or negative, only 23 bits are available
+// for positive values.
+const unsigned int MAX_FIRST_ARG = 0x7fffffu;
+const int BYTECODE_SHIFT = 8;
+static_assert(1 << BYTECODE_SHIFT > BYTECODE_MASK);
+
+// The list of bytecodes, in format: V(Name, Code, ByteLength).
+// TODO(pthier): Argument offsets of bytecodes should be easily accessible by
+// name or at least by position.
+// TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
+#define BYTECODE_ITERATOR(V) \
+ V(BREAK, 0, 4) /* bc8 */ \
+ V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
+ V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
+ V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
+ V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
+ V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
+ V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
+ V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
+ V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
+ V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
+ V(POP_CP, 10, 4) /* bc8 pad24 */ \
+ V(POP_BT, 11, 4) /* bc8 pad24 */ \
+ V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
+ V(FAIL, 13, 4) /* bc8 pad24 */ \
+ V(SUCCEED, 14, 4) /* bc8 pad24 */ \
+ V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
+ /* Jump to another bytecode given its offset. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x10 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: 0x00 (unused) Padding */ \
+ /* 0x20 - 0x3F: Address of bytecode to jump to */ \
+ V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
+ /* Check if offset is in range and load character at given offset. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x11 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: Offset from current position */ \
+ /* 0x20 - 0x3F: Address of bytecode when load is out of range */ \
+ V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
+ /* Load character at given offset without range checks. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x12 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: Offset from current position */ \
+ V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
+ V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
+ V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
+ V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
+ V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
+ V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
+ /* Check if current character is equal to a given character */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x18 (fixed) Bytecode */ \
+ /* 0x08 - 0x0F: 0x00 (unused) Padding */ \
+ /* 0x10 - 0x1F: Character to check */ \
+ /* 0x20 - 0x3F: Address of bytecode when matched */ \
+ V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
+ V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
+ V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
+ V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
+ /* Checks if the current character combined with mask (bitwise and) */ \
+ /* matches a character (e.g. used when two characters in a disjunction */ \
+ /* differ by only a single bit */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x1c (fixed) Bytecode */ \
+ /* 0x08 - 0x0F: 0x00 (unused) Padding */ \
+ /* 0x10 - 0x1F: Character to match against (after mask applied) */ \
+ /* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \
+ /* 0x40 - 0x5F: Address of bytecode when matched */ \
+ V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
+ V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
+ V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
+ V(MINUS_AND_CHECK_NOT_CHAR, 31, \
+ 12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32 */ \
+ V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
+ V(CHECK_CHAR_NOT_IN_RANGE, 33, \
+ 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
+ /* Checks if the current character matches any of the characters encoded */ \
+ /* in a bit table. Similar to/inspired by boyer moore string search */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x22 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: 0x00 (unused) Padding */ \
+ /* 0x20 - 0x3F: Address of bytecode when bit is set */ \
+ /* 0x40 - 0xBF: Bit table */ \
+ V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
+ V(CHECK_LT, 35, 8) /* bc8 pad8 base::uc16 addr32 */ \
+ V(CHECK_GT, 36, 8) /* bc8 pad8 base::uc16 addr32 */ \
+ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
+ V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \
+ V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
+ V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
+ V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
+ V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
+ V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
+ /* Checks if the current position matches top of backtrack stack */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x31 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: 0x00 (unused) Padding */ \
+ /* 0x20 - 0x3F: Address of bytecode when current matches tos */ \
+ V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
+ /* Advance character pointer by given offset and jump to another bytecode.*/ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x32 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: Number of characters to advance */ \
+ /* 0x20 - 0x3F: Address of bytecode to jump to */ \
+ V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
+ V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ \
+ /* Checks if current position + given offset is in range. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07: 0x34 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F: Offset from current position */ \
+ /* 0x20 - 0x3F: Address of bytecode when position is out of range */ \
+ V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32 */ \
+ /* Combination of: */ \
+ /* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO */ \
+ /* Emitted by RegExpBytecodePeepholeOptimization. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07 0x35 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F Load character offset from current position */ \
+ /* 0x20 - 0x3F Number of characters to advance */ \
+ /* 0x40 - 0xBF Bit Table */ \
+ /* 0xC0 - 0xDF Address of bytecode when character is matched */ \
+ /* 0xE0 - 0xFF Address of bytecode when no match */ \
+ V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32) \
+ /* Combination of: */ \
+ /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR */ \
+ /* and ADVANCE_CP_AND_GOTO */ \
+ /* Emitted by RegExpBytecodePeepholeOptimization. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07 0x36 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F Load character offset from current position */ \
+ /* 0x20 - 0x2F Number of characters to advance */ \
+ /* 0x30 - 0x3F Character to match against (after mask applied) */ \
+ /* 0x40 - 0x5F: Bitmask bitwise and combined with current character */ \
+ /* 0x60 - 0x7F Minimum number of characters this pattern consumes */ \
+ /* 0x80 - 0x9F Address of bytecode when character is matched */ \
+ /* 0xA0 - 0xBF Address of bytecode when no match */ \
+ V(SKIP_UNTIL_CHAR_AND, 54, 24) \
+ /* Combination of: */ \
+ /* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
+ /* Emitted by RegExpBytecodePeepholeOptimization. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07 0x37 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F Load character offset from current position */ \
+ /* 0x20 - 0x2F Number of characters to advance */ \
+ /* 0x30 - 0x3F Character to match */ \
+ /* 0x40 - 0x5F Address of bytecode when character is matched */ \
+ /* 0x60 - 0x7F Address of bytecode when no match */ \
+ V(SKIP_UNTIL_CHAR, 55, 16) \
+ /* Combination of: */ \
+ /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR */ \
+ /* and ADVANCE_CP_AND_GOTO */ \
+ /* Emitted by RegExpBytecodePeepholeOptimization. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07 0x38 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F Load character offset from current position */ \
+ /* 0x20 - 0x2F Number of characters to advance */ \
+ /* 0x30 - 0x3F Character to match */ \
+ /* 0x40 - 0x5F Minimum number of characters this pattern consumes */ \
+ /* 0x60 - 0x7F Address of bytecode when character is matched */ \
+ /* 0x80 - 0x9F Address of bytecode when no match */ \
+ V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20) \
+ /* Combination of: */ \
+ /* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
+ /* Emitted by RegExpBytecodePeepholeOptimization. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07 0x39 (fixed) Bytecode */ \
+ /* 0x08 - 0x1F Load character offset from current position */ \
+ /* 0x20 - 0x3F Number of characters to advance */ \
+ /* 0x40 - 0x4F Character to match */ \
+ /* 0x50 - 0x5F Other Character to match */ \
+ /* 0x60 - 0x7F Address of bytecode when either character is matched */ \
+ /* 0x80 - 0x9F Address of bytecode when no match */ \
+ V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20) \
+ /* Combination of: */ \
+ /* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and */ \
+ /* ADVANCE_CP_AND_GOTO */ \
+ /* Emitted by RegExpBytecodePeepholeOptimization. */ \
+ /* Bit Layout: */ \
+ /* 0x00 - 0x07 0x3A (fixed) Bytecode */ \
+ /* 0x08 - 0x1F Load character offset from current position */ \
+ /* 0x20 - 0x2F Number of characters to advance */ \
+ /* 0x30 - 0x3F Character to check if it is less than current char */ \
+ /* 0x40 - 0xBF Bit Table */ \
+ /* 0xC0 - 0xDF Address of bytecode when character is matched */ \
+ /* 0xE0 - 0xFF Address of bytecode when no match */ \
+ V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)
+
+// Every entry of the bytecode list expands to `+1`, so the expansion sums up
+// to the number of entries in the list.
+#define COUNT(...) +1
+static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
+#undef COUNT
+
+// Just making sure we assigned values above properly. They should be
+// contiguous, strictly increasing, and start at 0.
+// NOTE(review): the assertion below only pins the number of entries.
+// TODO(jgruber): Do not explicitly assign values, instead generate them
+// implicitly from the list order.
+static_assert(kRegExpBytecodeCount == 59);
+
+// Declares one constant BC_<Name> per list entry, holding that entry's code.
+#define DECLARE_BYTECODES(name, code, length) \
+ static constexpr int BC_##name = code;
+BYTECODE_ITERATOR(DECLARE_BYTECODES)
+#undef DECLARE_BYTECODES
+
+// Byte length of every bytecode, stored in list order.
+static constexpr int kRegExpBytecodeLengths[] = {
+#define DECLARE_BYTECODE_LENGTH(name, code, length) length,
+ BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
+#undef DECLARE_BYTECODE_LENGTH
+};
+
+// Returns the length in bytes of the instruction with code |bytecode|, which
+// must lie in [0, kRegExpBytecodeCount - 1].
+inline constexpr int RegExpBytecodeLength(int bytecode) {
+ DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
+ return kRegExpBytecodeLengths[bytecode];
+}
+
+// Name of every bytecode (the stringified list name), stored in list order.
+static constexpr const char* const kRegExpBytecodeNames[] = {
+#define DECLARE_BYTECODE_NAME(name, ...) #name,
+ BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
+#undef DECLARE_BYTECODE_NAME
+};
+
+// Returns the name of the instruction with code |bytecode|, which must lie
+// in [0, kRegExpBytecodeCount - 1].
+inline constexpr const char* RegExpBytecodeName(int bytecode) {
+ DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
+ return kRegExpBytecodeNames[bytecode];
+}
+
+void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
+void RegExpBytecodeDisassemble(const byte* code_base, int length,
+ const char* pattern);
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_BYTECODES_H_
diff --git a/js/src/irregexp/imported/regexp-compiler-tonode.cc b/js/src/irregexp/imported/regexp-compiler-tonode.cc
new file mode 100644
index 0000000000..8dc7ed629a
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-compiler-tonode.cc
@@ -0,0 +1,2042 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-compiler.h"
+
+#include "irregexp/imported/regexp.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "irregexp/imported/special-case.h"
+#include "unicode/locid.h"
+#include "unicode/uniset.h"
+#include "unicode/utypes.h"
+#endif // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
+
+// Upper bound of the code point range used by the range comparisons below.
+constexpr base::uc32 kMaxCodePoint = 0x10ffff;
+// Largest 16-bit code unit value, as a signed and as an unsigned constant.
+constexpr int kMaxUtf16CodeUnit = 0xffff;
+constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
+// Largest Latin-1 character code, taken from unibrow.
+constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
+
+// -------------------------------------------------------------------
+// Tree to graph conversion
+
+// Builds a TextNode holding this atom as its only text element and
+// continuing at |on_success|.
+RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
+                               RegExpNode* on_success) {
+  Zone* const zone = compiler->zone();
+  ZoneList<TextElement>* text_elements =
+      zone->New<ZoneList<TextElement>>(1, zone);
+  text_elements->Add(TextElement::Atom(this), zone);
+  return zone->New<TextNode>(text_elements, compiler->read_backward(),
+                             on_success);
+}
+
+// Builds a TextNode over this text's elements, continuing at |on_success|.
+RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
+                               RegExpNode* on_success) {
+  Zone* const zone = compiler->zone();
+  return zone->New<TextNode>(elements(), compiler->read_backward(),
+                             on_success);
+}
+
+namespace {
+
+// Returns true if |ranges| covers exactly what the class described by
+// |special_class| leaves out of [0, kMaxCodePoint]. |special_class| holds
+// pairs of boundaries (inclusive start, exclusive end) followed by
+// kRangeEndMarker; |length| counts the marker as well.
+bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
+                          const int* special_class, int length) {
+  const int boundary_count = length - 1;  // Remove final marker.
+
+  DCHECK_EQ(kRangeEndMarker, special_class[boundary_count]);
+  DCHECK_NE(0, ranges->length());
+  DCHECK_NE(0, boundary_count);
+  DCHECK_NE(0, special_class[0]);
+
+  if (ranges->length() != (boundary_count >> 1) + 1) return false;
+
+  CharacterRange current = ranges->at(0);
+  if (current.from() != 0) return false;
+
+  for (int i = 0; i < boundary_count; i += 2) {
+    // The current range has to end right before the class range begins ...
+    const base::uc32 class_start = static_cast<base::uc32>(special_class[i]);
+    if (class_start != (current.to() + 1)) return false;
+
+    // ... and the following range has to begin right where it ends.
+    current = ranges->at((i >> 1) + 1);
+    const base::uc32 class_end =
+        static_cast<base::uc32>(special_class[i + 1]);
+    if (class_end != current.from()) return false;
+  }
+
+  return current.to() == kMaxCodePoint;
+}
+
+// Returns true if |ranges| is exactly the class described by
+// |special_class|, a list of boundary pairs (inclusive start, exclusive end)
+// followed by kRangeEndMarker. |length| counts the marker as well.
+bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
+                   int length) {
+  const int boundary_count = length - 1;  // Remove final marker.
+
+  DCHECK_EQ(kRangeEndMarker, special_class[boundary_count]);
+  if (ranges->length() * 2 != boundary_count) return false;
+
+  for (int i = 0; i < boundary_count; i += 2) {
+    CharacterRange candidate = ranges->at(i >> 1);
+    if (candidate.from() != static_cast<base::uc32>(special_class[i])) {
+      return false;
+    }
+    if (candidate.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+} // namespace
+
+// Returns true if this class is one of the standard character sets. Negated
+// classes never are. If |set_| is not marked as standard yet, its ranges are
+// compared against the whitespace, line terminator and word tables and their
+// inverses; on a match the corresponding StandardCharacterSet type is stored
+// in |set_| before returning true.
+bool RegExpClassRanges::is_standard(Zone* zone) {
+ // TODO(lrn): Remove need for this function, by not throwing away information
+ // along the way.
+ if (is_negated()) {
+ return false;
+ }
+ if (set_.is_standard()) {
+ return true;
+ }
+ if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
+ return true;
+ }
+ if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
+ return true;
+ }
+ if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
+ kLineTerminatorRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
+ return true;
+ }
+ if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
+ kLineTerminatorRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
+ return true;
+ }
+ if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kWord);
+ return true;
+ }
+ if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
+ set_.set_standard_set_type(StandardCharacterSet::kNotWord);
+ return true;
+ }
+ return false;
+}
+
+// Feeds every range of |base| through AddRange().
+UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
+  // The unicode range splitter sorts the given character ranges into:
+  // - Code points from the BMP representable by one code unit.
+  // - Code points outside the BMP that need to be split into
+  //   surrogate pairs.
+  // - Lone lead surrogates.
+  // - Lone trail surrogates.
+  // Lone surrogates are valid code points even though they are not actual
+  // characters. They require special matching to make sure we do not split
+  // surrogate pairs.
+
+  for (int index = 0; index < base->length(); ++index) {
+    AddRange(base->at(index));
+  }
+}
+
+// Cuts |range| along five fixed segments (BMP below the surrogates, lead
+// surrogates, trail surrogates, BMP above the surrogates, non-BMP) and
+// appends each non-empty piece to the vector belonging to that segment.
+// Both BMP segments are collected in |bmp_|.
+void UnicodeRangeSplitter::AddRange(CharacterRange range) {
+ static constexpr base::uc32 kBmp1Start = 0;
+ static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
+ static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
+ static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
+
+ // Ends are all inclusive.
+ static_assert(kBmp1Start == 0);
+ static_assert(kBmp1Start < kBmp1End);
+ static_assert(kBmp1End + 1 == kLeadSurrogateStart);
+ static_assert(kLeadSurrogateStart < kLeadSurrogateEnd);
+ static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
+ static_assert(kTrailSurrogateStart < kTrailSurrogateEnd);
+ static_assert(kTrailSurrogateEnd + 1 == kBmp2Start);
+ static_assert(kBmp2Start < kBmp2End);
+ static_assert(kBmp2End + 1 == kNonBmpStart);
+ static_assert(kNonBmpStart < kNonBmpEnd);
+
+ // Parallel tables: segment i spans [kStarts[i], kEnds[i]] and its pieces
+ // are stored in *kTargets[i].
+ static constexpr base::uc32 kStarts[] = {
+ kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
+ kBmp2Start, kNonBmpStart,
+ };
+
+ static constexpr base::uc32 kEnds[] = {
+ kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
+ };
+
+ CharacterRangeVector* const kTargets[] = {
+ &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
+ };
+
+ static constexpr int kCount = arraysize(kStarts);
+ static_assert(kCount == arraysize(kEnds));
+ static_assert(kCount == arraysize(kTargets));
+
+ for (int i = 0; i < kCount; i++) {
+ // Segments are ascending, so nothing beyond this one can overlap |range|.
+ if (kStarts[i] > range.to()) break;
+ // Intersect |range| with segment i; skip the segment if they are disjoint.
+ const base::uc32 from = std::max(kStarts[i], range.from());
+ const base::uc32 to = std::min(kEnds[i], range.to());
+ if (from > to) continue;
+ kTargets[i]->emplace_back(CharacterRange::Range(from, to));
+ }
+}
+
+namespace {
+
+// Translates between new and old V8-isms (SmallVector, ZoneList): copies |v|
+// into a ZoneList allocated in |zone| and canonicalizes that list. Returns
+// nullptr if |v| is empty.
+ZoneList<CharacterRange>* ToCanonicalZoneList(
+    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
+  if (v->empty()) return nullptr;
+
+  const size_t range_count = v->size();
+  ZoneList<CharacterRange>* canonical =
+      zone->New<ZoneList<CharacterRange>>(static_cast<int>(range_count), zone);
+  for (size_t index = 0; index < range_count; ++index) {
+    canonical->Add(v->at(index), zone);
+  }
+
+  CharacterRange::Canonicalize(canonical);
+  return canonical;
+}
+
+// Adds one alternative to |result| that matches the BMP ranges collected by
+// |splitter| and continues at |on_success|. Does nothing if there are none.
+void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
+                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
+  Zone* const zone = compiler->zone();
+  ZoneList<CharacterRange>* bmp_ranges =
+      ToCanonicalZoneList(splitter->bmp(), zone);
+  if (bmp_ranges == nullptr) return;
+
+  RegExpNode* bmp_node = TextNode::CreateForCharacterRanges(
+      zone, bmp_ranges, compiler->read_backward(), on_success);
+  result->AddAlternative(GuardedAlternative(bmp_node));
+}
+
+// A {from, to} pair of 16-bit code units packed into one uint32_t: |from|
+// occupies the upper 16 bits, |to| the lower 16 bits.
+using UC16Range = uint32_t;
+
+// Packs |from| and |to| into a UC16Range.
+constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
+  return (static_cast<uint32_t>(from) << 16) | to;
+}
+
+// Returns the |from| half of |r| (its upper 16 bits).
+constexpr base::uc16 ExtractFrom(UC16Range r) {
+  return static_cast<base::uc16>(r >> 16);
+}
+
+// Returns the |to| half of |r| (the value truncated to base::uc16).
+constexpr base::uc16 ExtractTo(UC16Range r) {
+  return static_cast<base::uc16>(r);
+}
+
+// Adds alternatives to |result| that match the non-BMP ranges collected by
+// |splitter| as lead/trail surrogate pairs, each continuing at |on_success|.
+// Does nothing if |splitter| holds no non-BMP ranges. Must not be used when
+// compiling for one-byte subjects.
+void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
+ RegExpNode* on_success,
+ UnicodeRangeSplitter* splitter) {
+ DCHECK(!compiler->one_byte());
+ Zone* const zone = compiler->zone();
+ ZoneList<CharacterRange>* non_bmp =
+ ToCanonicalZoneList(splitter->non_bmp(), zone);
+ if (non_bmp == nullptr) return;
+
+ // Translate each 32-bit code point range into the corresponding 16-bit code
+ // unit representation consisting of the lead- and trail surrogate.
+ //
+ // The generated alternatives are grouped by the leading surrogate to avoid
+ // emitting excessive code. For example, for
+ //
+ // { \ud800[\udc00-\udc01]
+ // , \ud800[\udc05-\udc06]
+ // }
+ //
+ // there's no need to emit matching code for the leading surrogate \ud800
+ // twice. We also create a dedicated grouping for full trailing ranges, i.e.
+ // [dc00-dfff].
+ ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
+ zone);
+ ZoneList<CharacterRange>* leading_with_full_trailing_range =
+ zone->New<ZoneList<CharacterRange>>(1, zone);
+ // Files the pair [from_l-to_l][from_t-to_t] under its leading range. A pair
+ // with a full trailing range goes to the dedicated list, unless a group for
+ // this leading range already exists.
+ const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
+ base::uc16 from_t, base::uc16 to_t) {
+ const UC16Range leading_range = ToUC16Range(from_l, to_l);
+ if (grouped_by_leading.count(leading_range) == 0) {
+ if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
+ leading_with_full_trailing_range->Add(
+ CharacterRange::Range(from_l, to_l), zone);
+ return;
+ }
+ grouped_by_leading[leading_range] =
+ zone->New<ZoneList<CharacterRange>>(2, zone);
+ }
+ grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
+ zone);
+ };
+
+ // First, create the grouped ranges.
+ CharacterRange::Canonicalize(non_bmp);
+ for (int i = 0; i < non_bmp->length(); i++) {
+ // Match surrogate pair.
+ // E.g. [\u10005-\u11005] becomes
+ // \ud800[\udc05-\udfff]|
+ // [\ud801-\ud803][\udc00-\udfff]|
+ // \ud804[\udc00-\udc05]
+ base::uc32 from = non_bmp->at(i).from();
+ base::uc32 to = non_bmp->at(i).to();
+ base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
+ base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
+ base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
+ base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
+
+ if (from_l == to_l) {
+ // The lead surrogate is the same.
+ AddRange(from_l, to_l, from_t, to_t);
+ continue;
+ }
+
+ if (from_t != kTrailSurrogateStart) {
+ // Add [from_l][from_t-\udfff].
+ AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
+ from_l++;
+ }
+ if (to_t != kTrailSurrogateEnd) {
+ // Add [to_l][\udc00-to_t].
+ AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
+ to_l--;
+ }
+ if (from_l <= to_l) {
+ // Add [from_l-to_l][\udc00-\udfff].
+ AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
+ }
+ }
+
+ // Create the actual TextNode now that ranges are fully grouped.
+ if (!leading_with_full_trailing_range->is_empty()) {
+ CharacterRange::Canonicalize(leading_with_full_trailing_range);
+ result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, leading_with_full_trailing_range,
+ CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
+ compiler->read_backward(), on_success)));
+ }
+ // One alternative per remaining leading range, matching that leading range
+ // followed by its canonicalized trailing ranges.
+ for (const auto& it : grouped_by_leading) {
+ CharacterRange leading_range =
+ CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
+ ZoneList<CharacterRange>* trailing_ranges = it.second;
+ CharacterRange::Canonicalize(trailing_ranges);
+ result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
+ zone, leading_range, trailing_ranges, compiler->read_backward(),
+ on_success)));
+ }
+}
+
+// Builds a negative lookaround for `lookbehind`, read against the given
+// direction, followed by a match of `match` in the given direction that
+// continues at `on_success`.
+RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
+    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
+    ZoneList<CharacterRange>* match, RegExpNode* on_success,
+    bool read_backward) {
+  Zone* const zone = compiler->zone();
+  // Node consuming the actual match; it is the lookaround's continuation.
+  RegExpNode* const consume = TextNode::CreateForCharacterRanges(
+      zone, match, read_backward, on_success);
+  const int stack_reg = compiler->UnicodeLookaroundStackRegister();
+  const int position_reg = compiler->UnicodeLookaroundPositionRegister();
+  constexpr bool kIsPositive = false;
+  RegExpLookaround::Builder builder(kIsPositive, consume, stack_reg,
+                                    position_reg);
+  // The lookaround body reads in the opposite direction.
+  return builder.ForMatch(TextNode::CreateForCharacterRanges(
+      zone, lookbehind, !read_backward, builder.on_match_success()));
+}
+
+// Builds a match of `match` followed by a negative lookaround for
+// `lookahead`, both reading in the given direction; the lookaround continues
+// at `on_success`.
+RegExpNode* MatchAndNegativeLookaroundInReadDirection(
+    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
+    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
+    bool read_backward) {
+  Zone* const zone = compiler->zone();
+  const int stack_reg = compiler->UnicodeLookaroundStackRegister();
+  const int position_reg = compiler->UnicodeLookaroundPositionRegister();
+  constexpr bool kIsPositive = false;
+  RegExpLookaround::Builder builder(kIsPositive, on_success, stack_reg,
+                                    position_reg);
+  // Body of the lookaround: the ranges that must NOT follow the match.
+  RegExpNode* const forbidden = TextNode::CreateForCharacterRanges(
+      zone, lookahead, read_backward, builder.on_match_success());
+  RegExpNode* const guarded_continuation = builder.ForMatch(forbidden);
+  return TextNode::CreateForCharacterRanges(zone, match, read_backward,
+                                            guarded_continuation);
+}
+
+// Adds one alternative to `result` matching the splitter's lead surrogates
+// when they are not followed by a trail surrogate.
+// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
+void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+                           RegExpNode* on_success,
+                           UnicodeRangeSplitter* splitter) {
+  Zone* const zone = compiler->zone();
+  ZoneList<CharacterRange>* const leads =
+      ToCanonicalZoneList(splitter->lead_surrogates(), zone);
+  if (leads == nullptr) return;
+
+  ZoneList<CharacterRange>* const any_trail = CharacterRange::List(
+      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+  // Backward: first assert (reading forward) that no trail surrogate is
+  // there, then backward-match the lead surrogate.
+  // Forward: match the lead surrogate, then assert no trail surrogate
+  // follows.
+  RegExpNode* const node =
+      compiler->read_backward()
+          ? NegativeLookaroundAgainstReadDirectionAndMatch(
+                compiler, any_trail, leads, on_success, true)
+          : MatchAndNegativeLookaroundInReadDirection(
+                compiler, leads, any_trail, on_success, false);
+  result->AddAlternative(GuardedAlternative(node));
+}
+
+// Adds one alternative to `result` matching the splitter's trail surrogates
+// when they are not preceded by a lead surrogate.
+// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
+void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
+                            RegExpNode* on_success,
+                            UnicodeRangeSplitter* splitter) {
+  Zone* const zone = compiler->zone();
+  ZoneList<CharacterRange>* const trails =
+      ToCanonicalZoneList(splitter->trail_surrogates(), zone);
+  if (trails == nullptr) return;
+
+  ZoneList<CharacterRange>* const any_lead = CharacterRange::List(
+      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+
+  // Backward: backward-match the trail surrogate, then assert that no lead
+  // surrogate precedes it.
+  // Forward: first assert (reading backward) that there is no lead
+  // surrogate, then forward-match the trail surrogate.
+  RegExpNode* const node =
+      compiler->read_backward()
+          ? MatchAndNegativeLookaroundInReadDirection(
+                compiler, trails, any_lead, on_success, true)
+          : NegativeLookaroundAgainstReadDirectionAndMatch(
+                compiler, any_lead, trails, on_success, false);
+  result->AddAlternative(GuardedAlternative(node));
+}
+
+// This implements ES2015 21.2.5.2.3, AdvanceStringIndex: returns a node that
+// consumes one arbitrary UTF-16 code unit reading forward and then continues
+// at `on_success`. Only valid when the compiler is not reading backward.
+RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
+                              RegExpNode* on_success) {
+  DCHECK(!compiler->read_backward());
+  Zone* const zone = compiler->zone();
+  // Any code unit may be consumed. Should that be a lead surrogate, leaving
+  // us in the middle of a surrogate pair, nothing will match from there and
+  // we simply advance once more, consuming the associated trail surrogate.
+  const CharacterRange any_code_unit =
+      CharacterRange::Range(0, kMaxUtf16CodeUnit);
+  ZoneList<CharacterRange>* const everything =
+      CharacterRange::List(zone, any_code_unit);
+  constexpr bool kReadBackward = false;
+  return TextNode::CreateForCharacterRanges(zone, everything, kReadBackward,
+                                            on_success);
+}
+
+} // namespace
+
+#ifdef V8_INTL_SUPPORT
+// static
+// Closes `set` over simple case folding only, in place.
+void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
+  // closeOver() would add full-case-folding equivalents for some characters,
+  // but only simple case folding mappings are wanted. Set those characters
+  // aside before closing over.
+  icu::UnicodeSet set_aside(set);
+  set_aside.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
+  set.removeAll(set_aside);
+
+  set.closeOver(USET_CASE_INSENSITIVE);
+  // Full case folding maps single characters to multiple characters, which
+  // show up as strings in the set. Dropping them leaves only simple and
+  // common case mappings.
+  set.removeAllStrings();
+
+  // Characters with non-simple case foldings match themselves, so put them
+  // back.
+  set.addAll(set_aside);
+}
+#endif // V8_INTL_SUPPORT
+
+// static
+// Replaces the contents of `ranges` (which must be canonical) with their
+// simple-case-folding closure. Does nothing without V8_INTL_SUPPORT.
+void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
+                                               Zone* zone) {
+#ifdef V8_INTL_SUPPORT
+  DCHECK(IsCanonical(ranges));
+
+  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
+  // See also https://crbug.com/v8/6727.
+  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
+  // which we use frequently internally. But large ranges can also easily be
+  // created by the user. We might want to have a more general caching mechanism
+  // for such ranges.
+  const bool covers_everything =
+      ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd);
+  if (covers_everything) return;
+
+  // Hand the ranges to ICU, which computes the case fold closure.
+  icu::UnicodeSet closure;
+  for (int idx = 0; idx < ranges->length(); idx++) {
+    closure.add(ranges->at(idx).from(), ranges->at(idx).to());
+  }
+  // Empty the list but keep its backing store for the results.
+  ranges->Rewind(0);
+
+  UnicodeSimpleCloseOver(closure);
+  const int closed_range_count = closure.getRangeCount();
+  for (int idx = 0; idx < closed_range_count; idx++) {
+    ranges->Add(Range(closure.getRangeStart(idx), closure.getRangeEnd(idx)),
+                zone);
+  }
+  // No errors and everything we collected have been ranges.
+  Canonicalize(ranges);
+#endif  // V8_INTL_SUPPORT
+}
+
+// Compiles this character class into a node continuing at `on_success`.
+// Without a unicode flag, in one-byte mode, or when the class contains a
+// split surrogate, a plain TextNode is produced. Otherwise the ranges are
+// split into BMP, surrogate-pair and lone-surrogate alternatives.
+RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
+                                      RegExpNode* on_success) {
+  set_.Canonicalize();
+  Zone* const zone = compiler->zone();
+  ZoneList<CharacterRange>* ranges = this->ranges(zone);
+  const RegExpFlags flags = compiler->flags();
+
+  if (NeedsUnicodeCaseEquivalents(flags)) {
+    CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
+  }
+
+  const bool needs_surrogate_handling = IsEitherUnicode(flags) &&
+                                        !compiler->one_byte() &&
+                                        !contains_split_surrogate();
+  if (!needs_surrogate_handling) {
+    return zone->New<TextNode>(this, compiler->read_backward(), on_success);
+  }
+
+  if (is_negated()) {
+    // With /v, character classes are never negated.
+    // TODO(v8:11935): Change permalink once proposal is in stage 4.
+    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom
+    //   Atom :: CharacterClass
+    //     4. Assert: cc.[[Invert]] is false.
+    // Instead the complement is created when evaluating the class set.
+    // The only exception is the "nothing range" (negated everything), which is
+    // internally created for an empty set.
+    DCHECK_IMPLIES(
+        IsUnicodeSets(flags),
+        ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
+    ZoneList<CharacterRange>* const complement =
+        zone->New<ZoneList<CharacterRange>>(2, zone);
+    CharacterRange::Negate(ranges, complement, zone);
+    ranges = complement;
+  }
+
+  if (ranges->length() == 0) {
+    // An empty character class serves as a 'fail' node.
+    RegExpClassRanges* const never_matches =
+        zone->New<RegExpClassRanges>(zone, ranges);
+    return zone->New<TextNode>(never_matches, compiler->read_backward(),
+                               on_success);
+  }
+
+  const bool matches_everything =
+      set_.is_standard() &&
+      standard_type() == StandardCharacterSet::kEverything;
+  if (matches_everything) return UnanchoredAdvance(compiler, on_success);
+
+  // Split ranges in order to handle surrogates correctly:
+  // - Surrogate pairs: translate the 32-bit code point into two uc16 code
+  //   units (irregexp operates only on code units).
+  // - Lone surrogates: these require lookarounds to ensure we don't match in
+  //   the middle of a surrogate pair.
+  ChoiceNode* const choices = zone->New<ChoiceNode>(2, zone);
+  UnicodeRangeSplitter splitter(ranges);
+  AddBmpCharacters(compiler, choices, on_success, &splitter);
+  AddNonBmpSurrogatePairs(compiler, choices, on_success, &splitter);
+  AddLoneLeadSurrogates(compiler, choices, on_success, &splitter);
+  AddLoneTrailSurrogates(compiler, choices, on_success, &splitter);
+
+  static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
+  if (ranges->length() > kMaxRangesToInline) choices->SetDoNotInline();
+
+  return choices;
+}
+
+// Compiles this operand as a disjunction of its non-empty strings, its
+// character ranges (as a single class) and, last, the empty string if
+// present. An operand with neither strings nor ranges compiles to an empty
+// character class.
+RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
+                                          RegExpNode* on_success) {
+  Zone* const zone = compiler->zone();
+  const int string_count =
+      has_strings() ? static_cast<int>(strings()->size()) : 0;
+  const int range_count = ranges()->is_empty() ? 0 : 1;
+  const int size = string_count + range_count;
+  if (size == 0) {
+    // Neither ranges nor strings: equivalent to an empty range, which
+    // matches nothing.
+    ZoneList<CharacterRange>* const no_ranges =
+        zone->New<ZoneList<CharacterRange>>(0, zone);
+    RegExpClassRanges* const nothing =
+        zone->New<RegExpClassRanges>(zone, no_ranges);
+    return nothing->ToNode(compiler, on_success);
+  }
+
+  ZoneList<RegExpTree*>* const choices =
+      zone->New<ZoneList<RegExpTree*>>(size, zone);
+  // Strings are sorted by length first (larger strings before shorter ones).
+  // See the comment on CharacterClassStrings.
+  // An empty string, if present, is held back and appended after the
+  // character ranges.
+  RegExpTree* deferred_empty = nullptr;
+  if (has_strings()) {
+    for (const auto& entry : *strings()) {
+      if (entry.second->IsEmpty()) {
+        deferred_empty = entry.second;
+        continue;
+      }
+      choices->Add(entry.second, zone);
+    }
+  }
+  if (!ranges()->is_empty()) {
+    choices->Add(zone->New<RegExpClassRanges>(zone, ranges()), zone);
+  }
+  if (deferred_empty != nullptr) choices->Add(deferred_empty, zone);
+
+  if (size != 1) {
+    RegExpTree* const disjunction = zone->New<RegExpDisjunction>(choices);
+    return disjunction->ToNode(compiler, on_success);
+  }
+  DCHECK_EQ(choices->length(), 1);
+  RegExpTree* const only = choices->first();
+  return only->ToNode(compiler, on_success);
+}
+
+// Evaluates the whole class set expression down to a single operand and
+// compiles that operand.
+RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
+                                             RegExpNode* on_success) {
+  Zone* const zone = compiler->zone();
+  // Scratch list shared by all set operations of the evaluation.
+  ZoneList<CharacterRange>* const scratch =
+      zone->New<ZoneList<CharacterRange>>(4, zone);
+  RegExpClassSetOperand* const evaluated =
+      ComputeExpression(this, scratch, zone);
+  return evaluated->ToNode(compiler, on_success);
+}
+
+// Adds the ranges and strings of `other` to this operand. The ranges are
+// merely appended here, not canonicalized.
+void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
+  ranges()->AddAll(*other->ranges(), zone);
+  if (!other->has_strings()) return;
+
+  // Create our own string set on demand before merging.
+  if (strings_ == nullptr) {
+    strings_ = zone->New<CharacterClassStrings>(zone);
+  }
+  strings()->insert(other->strings()->begin(), other->strings()->end());
+}
+
+// Reduces this operand to what it has in common with `other`. `temp_ranges`
+// is scratch space: it receives the intersected ranges, is swapped into this
+// operand and is left empty on return.
+void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
+                                      ZoneList<CharacterRange>* temp_ranges,
+                                      Zone* zone) {
+  CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
+  std::swap(*ranges(), *temp_ranges);
+  temp_ranges->Rewind(0);
+
+  if (!has_strings()) return;
+  if (!other->has_strings()) {
+    // Nothing can be shared with an operand that has no strings.
+    strings()->clear();
+    return;
+  }
+  // Keep only the strings that `other` has as well.
+  auto it = strings()->begin();
+  while (it != strings()->end()) {
+    const bool in_other =
+        other->strings()->find(it->first) != other->strings()->end();
+    if (in_other) {
+      it++;
+    } else {
+      it = strings()->erase(it);
+    }
+  }
+}
+
+// Removes from this operand everything that `other` contains. `temp_ranges`
+// is scratch space: it receives the remaining ranges, is swapped into this
+// operand and is left empty on return.
+void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
+                                     ZoneList<CharacterRange>* temp_ranges,
+                                     Zone* zone) {
+  CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
+  std::swap(*ranges(), *temp_ranges);
+  temp_ranges->Rewind(0);
+
+  // Strings only need work when both sides have some.
+  if (!has_strings() || !other->has_strings()) return;
+  auto it = strings()->begin();
+  while (it != strings()->end()) {
+    const bool in_other =
+        other->strings()->find(it->first) != other->strings()->end();
+    if (in_other) {
+      it = strings()->erase(it);
+    } else {
+      it++;
+    }
+  }
+}
+
+// static
+// Evaluates the class set tree rooted at `root` and returns the resulting
+// operand. An operand is returned as-is. For an expression, the result is
+// accumulated in place in its (evaluated) first operand, and the expression's
+// operand list is afterwards reduced to just that result. `temp_ranges` is
+// scratch space that must be empty on entry.
+RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(
+    RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
+  DCHECK(temp_ranges->is_empty());
+  if (root->IsClassSetOperand()) return root->AsClassSetOperand();
+
+  DCHECK(root->IsClassSetExpression());
+  RegExpClassSetExpression* const expr = root->AsClassSetExpression();
+  // Evaluates the operand subtree at `idx` down to a single operand.
+  const auto evaluate_operand = [&](int idx) {
+    return ComputeExpression(expr->operands()->at(idx), temp_ranges, zone);
+  };
+
+  RegExpClassSetOperand* const accumulated = evaluate_operand(0);
+  switch (expr->operation()) {
+    case OperationType::kUnion:
+      for (int idx = 1; idx < expr->operands()->length(); idx++) {
+        accumulated->Union(evaluate_operand(idx), zone);
+      }
+      // Union only appends; canonicalize once at the end.
+      CharacterRange::Canonicalize(accumulated->ranges());
+      break;
+    case OperationType::kIntersection:
+      for (int idx = 1; idx < expr->operands()->length(); idx++) {
+        accumulated->Intersect(evaluate_operand(idx), temp_ranges, zone);
+      }
+      break;
+    case OperationType::kSubtraction:
+      for (int idx = 1; idx < expr->operands()->length(); idx++) {
+        accumulated->Subtract(evaluate_operand(idx), temp_ranges, zone);
+      }
+      break;
+  }
+
+  if (expr->is_negated()) {
+    DCHECK(!accumulated->has_strings());
+    CharacterRange::Negate(accumulated->ranges(), temp_ranges, zone);
+    std::swap(*accumulated->ranges(), *temp_ranges);
+    temp_ranges->Rewind(0);
+  }
+
+  // The expression now consists of its result as the single operand.
+  expr->operands()->Set(0, accumulated);
+  expr->operands()->Rewind(1);
+
+  return accumulated;
+}
+
+namespace {
+
+// Three-way comparison of two atoms by their first code unit. Both trees
+// must be atoms.
+int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
+  const base::uc16 lhs = (*a)->AsAtom()->data().at(0);
+  const base::uc16 rhs = (*b)->AsAtom()->data().at(0);
+  if (lhs == rhs) return 0;
+  return lhs < rhs ? -1 : 1;
+}
+
+#ifdef V8_INTL_SUPPORT
+
+// Compares two strings via ICU's caseCompare with default case folding.
+int CompareCaseInsensitive(const icu::UnicodeString& a,
+                           const icu::UnicodeString& b) {
+  const int order = a.caseCompare(b, U_FOLD_CASE_DEFAULT);
+  return order;
+}
+
+// Case-insensitive comparison of two atoms by their first code unit. Both
+// trees must be atoms.
+int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
+                                    RegExpTree* const* b) {
+  const icu::UnicodeString lhs{(*a)->AsAtom()->data().at(0)};
+  const icu::UnicodeString rhs{(*b)->AsAtom()->data().at(0)};
+  return CompareCaseInsensitive(lhs, rhs);
+}
+
+// True if `a` and `b` are identical or, when `ignore_case` is set, compare
+// equal case-insensitively.
+bool Equals(bool ignore_case, const icu::UnicodeString& a,
+            const icu::UnicodeString& b) {
+  return a == b || (ignore_case && CompareCaseInsensitive(a, b) == 0);
+}
+
+// Compares the code units at `index` of two atoms, optionally ignoring case.
+bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
+                  const RegExpAtom* b) {
+  const base::uc16 lhs = a->data().at(index);
+  const base::uc16 rhs = b->data().at(index);
+  return Equals(ignore_case, lhs, rhs);
+}
+
+#else
+
+// Returns the single character that `canonicalize` maps `c` to, or `c`
+// itself when the mapping yields nothing.
+unibrow::uchar Canonical(
+    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+    unibrow::uchar c) {
+  unibrow::uchar mapped[unibrow::Ecma262Canonicalize::kMaxWidth];
+  const int mapped_count = canonicalize->get(c, '\0', mapped);
+  DCHECK_LE(mapped_count, 1);
+  return mapped_count == 1 ? mapped[0] : c;
+}
+
+// Three-way comparison of two characters after canonicalization. The sign of
+// the result gives the ordering; zero means equal.
+int CompareCaseInsensitive(
+    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+    unibrow::uchar a, unibrow::uchar b) {
+  if (a == b) return 0;
+  // Characters below 'a' are compared as they are; canonicalize only when at
+  // least one side is 'a' or above.
+  const bool both_below_a = a < 'a' && b < 'a';
+  if (!both_below_a) {
+    a = Canonical(canonicalize, a);
+    b = Canonical(canonicalize, b);
+  }
+  return static_cast<int>(a) - static_cast<int>(b);
+}
+
+// Case-insensitive comparison of two atoms by their first code unit. Both
+// trees must be atoms.
+int CompareFirstCharCaseInsensitive(
+    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+    RegExpTree* const* a, RegExpTree* const* b) {
+  RegExpAtom* const lhs = (*a)->AsAtom();
+  RegExpAtom* const rhs = (*b)->AsAtom();
+  return CompareCaseInsensitive(canonicalize, lhs->data().at(0),
+                                rhs->data().at(0));
+}
+
+// True if `a` and `b` are identical or, when `ignore_case` is set, compare
+// equal case-insensitively.
+bool Equals(bool ignore_case,
+            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+            unibrow::uchar a, unibrow::uchar b) {
+  return a == b ||
+         (ignore_case && CompareCaseInsensitive(canonicalize, a, b) == 0);
+}
+
+// Compares the code units at `index` of two atoms, optionally ignoring case.
+bool CharAtEquals(bool ignore_case,
+                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+                  int index, const RegExpAtom* a, const RegExpAtom* b) {
+  const unibrow::uchar lhs = a->data().at(index);
+  const unibrow::uchar rhs = b->data().at(index);
+  return Equals(ignore_case, canonicalize, lhs, rhs);
+}
+
+#endif // V8_INTL_SUPPORT
+
+} // namespace
+
+// We can stable sort runs of atoms, since the order does not matter if they
+// start with different characters.
+// Each maximal run of consecutive atom alternatives is stable-sorted in place
+// by first character; non-atom alternatives keep their positions.
+// Returns true if any consecutive atoms were found.
+bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+ bool found_consecutive_atoms = false;
+ for (int i = 0; i < length; i++) {
+ // Skip forward to the next atom.
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (alternative->IsAtom()) break;
+ i++;
+ }
+ // i is length or it is the index of an atom.
+ if (i == length) break;
+ int first_atom = i;
+ i++;
+ // Extend the run [first_atom, i) over all directly following atoms.
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ i++;
+ }
+ // Sort atoms to get ones with common prefixes together.
+ // This step is more tricky if we are in a case-independent regexp,
+ // because it would change /is|I/ to /I|is/, and order matters when
+ // the regexp parts don't match only disjoint starting points. To fix
+ // this we have a version of CompareFirstChar that uses case-
+ // independent character classes for comparison.
+ DCHECK_LT(first_atom, alternatives->length());
+ DCHECK_LE(i, alternatives->length());
+ DCHECK_LE(first_atom, i);
+ if (IsIgnoreCase(compiler->flags())) {
+#ifdef V8_INTL_SUPPORT
+ alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
+ i - first_atom);
+#else
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+ compiler->isolate()->regexp_macro_assembler_canonicalize();
+ // The comparator needs the canonicalization table, hence the closure.
+ auto compare_closure = [canonicalize](RegExpTree* const* a,
+ RegExpTree* const* b) {
+ return CompareFirstCharCaseInsensitive(canonicalize, a, b);
+ };
+ alternatives->StableSort(compare_closure, first_atom, i - first_atom);
+#endif // V8_INTL_SUPPORT
+ } else {
+ alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
+ }
+ // A run of a single atom does not count as "consecutive atoms".
+ if (i - first_atom > 1) found_consecutive_atoms = true;
+ // Here i is either length or the index of a non-atom; the loop increment
+ // steps past it.
+ }
+ return found_consecutive_atoms;
+}
+
+// Optimizes ab|ac|az to a(?:b|c|z).
+// Rewrites the alternatives list in place: every run of at least three
+// consecutive atoms whose first characters are equal (case-insensitively when
+// the ignore-case flag is set) is replaced by a single alternative consisting
+// of the common prefix followed by a disjunction of the suffixes. All other
+// alternatives are kept, in order.
+void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
+ Zone* zone = compiler->zone();
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+ const bool ignore_case = IsIgnoreCase(compiler->flags());
+
+ // Compacting rewrite: i reads, write_posn (<= i) writes.
+ int write_posn = 0;
+ int i = 0;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) {
+ // Non-atoms are copied through unchanged.
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ RegExpAtom* const atom = alternative->AsAtom();
+#ifdef V8_INTL_SUPPORT
+ icu::UnicodeString common_prefix(atom->data().at(0));
+#else
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
+ compiler->isolate()->regexp_macro_assembler_canonicalize();
+ unibrow::uchar common_prefix = atom->data().at(0);
+ if (ignore_case) {
+ common_prefix = Canonical(canonicalize, common_prefix);
+ }
+#endif // V8_INTL_SUPPORT
+ int first_with_prefix = i;
+ // Upper bound for the common prefix: the length of the shortest atom in
+ // the run (updated while the run is collected below).
+ int prefix_length = atom->length();
+ i++;
+ // Collect the run of following atoms that start with the same character.
+ while (i < length) {
+ alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ RegExpAtom* const alt_atom = alternative->AsAtom();
+#ifdef V8_INTL_SUPPORT
+ icu::UnicodeString new_prefix(alt_atom->data().at(0));
+ if (!Equals(ignore_case, new_prefix, common_prefix)) break;
+#else
+ unibrow::uchar new_prefix = alt_atom->data().at(0);
+ if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
+#endif // V8_INTL_SUPPORT
+ prefix_length = std::min(prefix_length, alt_atom->length());
+ i++;
+ }
+ // Only runs of at least three atoms are rewritten.
+ if (i > first_with_prefix + 2) {
+ // Found worthwhile run of alternatives with common prefix of at least one
+ // character. The sorting function above did not sort on more than one
+ // character for reasons of correctness, but there may still be a longer
+ // common prefix if the terms were similar or presorted in the input.
+ // Find out how long the common prefix is.
+ int run_length = i - first_with_prefix;
+ RegExpAtom* const alt_atom =
+ alternatives->at(first_with_prefix)->AsAtom();
+ // Compare every other atom of the run against the first one, shrinking
+ // prefix_length to the first position at which they differ.
+ for (int j = 1; j < run_length && prefix_length > 1; j++) {
+ RegExpAtom* old_atom =
+ alternatives->at(j + first_with_prefix)->AsAtom();
+ for (int k = 1; k < prefix_length; k++) {
+#ifdef V8_INTL_SUPPORT
+ if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
+#else
+ if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
+#endif // V8_INTL_SUPPORT
+ prefix_length = k;
+ break;
+ }
+ }
+ }
+ // The prefix text is taken from the first atom of the run.
+ RegExpAtom* prefix =
+ zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length));
+ ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
+ pair->Add(prefix, zone);
+ ZoneList<RegExpTree*>* suffixes =
+ zone->New<ZoneList<RegExpTree*>>(run_length, zone);
+ for (int j = 0; j < run_length; j++) {
+ RegExpAtom* old_atom =
+ alternatives->at(j + first_with_prefix)->AsAtom();
+ int len = old_atom->length();
+ if (len == prefix_length) {
+ // The atom is entirely covered by the prefix: empty suffix.
+ suffixes->Add(zone->New<RegExpEmpty>(), zone);
+ } else {
+ RegExpTree* suffix = zone->New<RegExpAtom>(
+ old_atom->data().SubVector(prefix_length, old_atom->length()));
+ suffixes->Add(suffix, zone);
+ }
+ }
+ pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
+ // The whole run collapses into one alternative: prefix, then suffixes.
+ alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
+ } else {
+ // Just copy any non-worthwhile alternatives.
+ for (int j = first_with_prefix; j < i; j++) {
+ alternatives->at(write_posn++) = alternatives->at(j);
+ }
+ }
+ }
+ alternatives->Rewind(write_posn); // Trim end of array.
+}
+
+// Optimizes b|c|z to [bcz].
+// Rewrites the alternatives list in place: every run of two or more
+// consecutive single-character atoms is replaced by one character class;
+// all other alternatives are kept, in order.
+void RegExpDisjunction::FixSingleCharacterDisjunctions(
+    RegExpCompiler* compiler) {
+  Zone* const zone = compiler->zone();
+  ZoneList<RegExpTree*>* const alts = this->alternatives();
+  const int count = alts->length();
+  const RegExpFlags flags = compiler->flags();
+
+  const auto is_single_char_atom = [](RegExpTree* tree) {
+    return tree->IsAtom() && tree->AsAtom()->length() == 1;
+  };
+
+  // Compacting rewrite: read_pos reads, write_pos (<= read_pos) writes.
+  int write_pos = 0;
+  int read_pos = 0;
+  while (read_pos < count) {
+    if (!is_single_char_atom(alts->at(read_pos))) {
+      alts->at(write_pos++) = alts->at(read_pos);
+      read_pos++;
+      continue;
+    }
+
+    // Scan the run of single-character atoms starting here. All of them are
+    // compiled with the same flags (case independence and unicode-ness).
+    const int run_start = read_pos;
+    bool saw_trail_surrogate = false;
+    for (; read_pos < count && is_single_char_atom(alts->at(read_pos));
+         read_pos++) {
+      const base::uc16 c = alts->at(read_pos)->AsAtom()->data().at(0);
+      DCHECK_IMPLIES(IsEitherUnicode(flags),
+                     !unibrow::Utf16::IsLeadSurrogate(c));
+      saw_trail_surrogate |= unibrow::Utf16::IsTrailSurrogate(c);
+    }
+
+    const int run_length = read_pos - run_start;
+    if (run_length == 1) {
+      // A trivial run: keep the lone atom as it is.
+      alts->at(write_pos++) = alts->at(run_start);
+      continue;
+    }
+
+    // Non-trivial run: collect the characters into one class.
+    ZoneList<CharacterRange>* const ranges =
+        zone->New<ZoneList<CharacterRange>>(2, zone);
+    for (int offset = 0; offset < run_length; offset++) {
+      RegExpAtom* const member = alts->at(run_start + offset)->AsAtom();
+      DCHECK_EQ(member->length(), 1);
+      ranges->Add(CharacterRange::Singleton(member->data().at(0)), zone);
+    }
+    RegExpClassRanges::ClassRangesFlags class_ranges_flags;
+    if (IsEitherUnicode(flags) && saw_trail_surrogate) {
+      class_ranges_flags = RegExpClassRanges::CONTAINS_SPLIT_SURROGATE;
+    }
+    alts->at(write_pos++) =
+        zone->New<RegExpClassRanges>(zone, ranges, class_ranges_flags);
+  }
+  alts->Rewind(write_pos);  // Trim end of array.
+}
+
+// Compiles the disjunction into a ChoiceNode with one alternative per
+// branch, each continuing at `on_success`. With more than two branches the
+// atom optimizations are applied first; if they reduce the disjunction to a
+// single branch, that branch is compiled directly.
+RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
+                                      RegExpNode* on_success) {
+  compiler->ToNodeMaybeCheckForStackOverflow();
+
+  ZoneList<RegExpTree*>* const branches = this->alternatives();
+
+  if (branches->length() > 2) {
+    if (SortConsecutiveAtoms(compiler)) RationalizeConsecutiveAtoms(compiler);
+    FixSingleCharacterDisjunctions(compiler);
+    if (branches->length() == 1) {
+      return branches->at(0)->ToNode(compiler, on_success);
+    }
+  }
+
+  const int branch_count = branches->length();
+  Zone* const zone = compiler->zone();
+  ChoiceNode* const choice = zone->New<ChoiceNode>(branch_count, zone);
+  for (int idx = 0; idx < branch_count; idx++) {
+    RegExpNode* const branch_node =
+        branches->at(idx)->ToNode(compiler, on_success);
+    choice->AddAlternative(GuardedAlternative(branch_node));
+  }
+  return choice;
+}
+
+// Forwards this quantifier's bounds, greediness and body to the overload
+// that takes them explicitly.
+RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
+                                     RegExpNode* on_success) {
+  const auto lower_bound = min();
+  const auto upper_bound = max();
+  const auto greedy = is_greedy();
+  auto* const quantified = body();
+  return ToNode(lower_bound, upper_bound, greedy, quantified, compiler,
+                on_success);
+}
+
+namespace {
+// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
+//         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
+// Only valid when the flags need unicode case equivalents (CHECKed).
+RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
+                                          RegExpNode* on_success,
+                                          RegExpAssertion::Type type,
+                                          RegExpFlags flags) {
+  CHECK(NeedsUnicodeCaseEquivalents(flags));
+  Zone* const zone = compiler->zone();
+  ZoneList<CharacterRange>* const word_chars =
+      zone->New<ZoneList<CharacterRange>>(2, zone);
+  CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_chars, true,
+                                 zone);
+  const int stack_reg = compiler->UnicodeLookaroundStackRegister();
+  const int position_reg = compiler->UnicodeLookaroundPositionRegister();
+  ChoiceNode* const choices = zone->New<ChoiceNode>(2, zone);
+  const bool is_boundary = type == RegExpAssertion::Type::BOUNDARY;
+
+  // Adds the choice whose left side is a word character (`word_on_left`) or
+  // a non-word character (`!word_on_left`).
+  const auto add_choice = [&](bool word_on_left) {
+    // At a boundary both sides differ; at a non-boundary they agree.
+    const bool word_on_right = is_boundary != word_on_left;
+    // Look to the left.
+    RegExpLookaround::Builder lookbehind(word_on_left, on_success, stack_reg,
+                                         position_reg);
+    RegExpNode* const backward = TextNode::CreateForCharacterRanges(
+        zone, word_chars, true, lookbehind.on_match_success());
+    // Look to the right; on success continue with the look to the left.
+    RegExpLookaround::Builder lookahead(
+        word_on_right, lookbehind.ForMatch(backward), stack_reg, position_reg);
+    RegExpNode* const forward = TextNode::CreateForCharacterRanges(
+        zone, word_chars, false, lookahead.on_match_success());
+    choices->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
+  };
+
+  // The (non-)boundary could start with a word or a non-word character.
+  add_choice(true);
+  add_choice(false);
+  return choices;
+}
+} // anonymous namespace
+
+// Compiles this assertion into a node that continues at `on_success` once
+// the assertion holds:
+//  - START_OF_LINE, START_OF_INPUT and END_OF_INPUT map directly to
+//    AssertionNodes.
+//  - BOUNDARY and NON_BOUNDARY map to AssertionNodes, unless the flags need
+//    unicode case equivalents, in which case they are desugared into
+//    lookarounds.
+//  - END_OF_LINE becomes a choice between a positive lookahead for a line
+//    terminator and an end-of-input assertion.
+RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
+                                    RegExpNode* on_success) {
+  Zone* zone = compiler->zone();
+
+  switch (assertion_type()) {
+    case Type::START_OF_LINE:
+      return AssertionNode::AfterNewline(on_success);
+    case Type::START_OF_INPUT:
+      return AssertionNode::AtStart(on_success);
+    case Type::BOUNDARY:
+      return NeedsUnicodeCaseEquivalents(compiler->flags())
+                 ? BoundaryAssertionAsLookaround(
+                       compiler, on_success, Type::BOUNDARY, compiler->flags())
+                 : AssertionNode::AtBoundary(on_success);
+    case Type::NON_BOUNDARY:
+      return NeedsUnicodeCaseEquivalents(compiler->flags())
+                 ? BoundaryAssertionAsLookaround(compiler, on_success,
+                                                 Type::NON_BOUNDARY,
+                                                 compiler->flags())
+                 : AssertionNode::AtNonBoundary(on_success);
+    case Type::END_OF_INPUT:
+      return AssertionNode::AtEnd(on_success);
+    case Type::END_OF_LINE: {
+      // Compile $ in multiline regexps as an alternation with a positive
+      // lookahead in one side and an end-of-input on the other side.
+      // We need two registers for the lookahead.
+      int stack_pointer_register = compiler->AllocateRegister();
+      int position_register = compiler->AllocateRegister();
+      // The ChoiceNode to distinguish between a newline and end-of-input.
+      ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
+      // Create a newline atom. The standard line-terminator class is used
+      // directly; no separate range list is needed.
+      RegExpClassRanges* newline_atom =
+          zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator);
+      TextNode* newline_matcher =
+          zone->New<TextNode>(newline_atom, false,
+                              ActionNode::PositiveSubmatchSuccess(
+                                  stack_pointer_register, position_register,
+                                  0,   // No captures inside.
+                                  -1,  // Ignored if no captures.
+                                  on_success));
+      // Wrap the newline matcher in a positive lookahead so that the line
+      // terminator is checked but not consumed.
+      RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
+          stack_pointer_register, position_register, newline_matcher);
+      // Add the two alternatives to the ChoiceNode: first the lookahead for
+      // a line terminator, then the end-of-input assertion.
+      GuardedAlternative eol_alternative(end_of_line);
+      result->AddAlternative(eol_alternative);
+      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
+      result->AddAlternative(end_alternative);
+      return result;
+    }
+    default:
+      UNREACHABLE();
+  }
+}
+
+// Compiles the back reference into a BackReferenceNode over the start and
+// end registers of the referenced capture, reading in the compiler's current
+// direction.
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
+                                        RegExpNode* on_success) {
+  const int start_register = RegExpCapture::StartRegister(index());
+  const int end_register = RegExpCapture::EndRegister(index());
+  Zone* const zone = compiler->zone();
+  return zone->New<BackReferenceNode>(start_register, end_register, flags_,
+                                      compiler->read_backward(), on_success);
+}
+
+// An empty regexp contributes no node of its own: the continuation
+// `on_success` is returned unchanged. `compiler` is unused.
+RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return on_success;
+}
+
+// A group adds no node of its own: compilation is delegated to the group's
+// body with the same compiler and continuation.
+RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
+ return body_->ToNode(compiler, on_success);
+}
+
+// Prepares a lookaround and creates the node that the lookaround's body must
+// continue with when it matches (see on_match_success()). For a positive
+// lookaround that node leads on to `on_success`; for a negative one it is a
+// NegativeSubmatchSuccess node allocated in on_success's zone.
+RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
+                                   int stack_pointer_register,
+                                   int position_register,
+                                   int capture_register_count,
+                                   int capture_register_start)
+    : is_positive_(is_positive),
+      on_success_(on_success),
+      stack_pointer_register_(stack_pointer_register),
+      position_register_(position_register) {
+  if (!is_positive_) {
+    Zone* const zone = on_success_->zone();
+    on_match_success_ = zone->New<NegativeSubmatchSuccess>(
+        stack_pointer_register, position_register, capture_register_count,
+        capture_register_start, zone);
+    return;
+  }
+  on_match_success_ = ActionNode::PositiveSubmatchSuccess(
+      stack_pointer_register, position_register, capture_register_count,
+      capture_register_start, on_success_);
+}
+
+ RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
+   if (!is_positive_) {
+     // A negative lookaround is represented by a two-way choice. The first
+     // alternative is the lookaround body; when it succeeds, its end node
+     // backtracks. When it fails, the second alternative is tried and leads
+     // to success. NegativeLookaroundChoiceNode is a special ChoiceNode that
+     // ignores the first exit when calculating quick checks.
+     Zone* zone = on_success_->zone();
+     GuardedAlternative lookaround_body(match);
+     GuardedAlternative continuation(on_success_);
+     ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
+         lookaround_body, continuation, zone);
+     return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
+                                              position_register_, choice_node);
+   }
+   // Positive lookaround: simply wrap the body in a submatch start action.
+   return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
+                                            position_register_, match);
+ }
+
+ RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
+                                      RegExpNode* on_success) {
+   // Two scratch registers for the submatch actions; the allocation order
+   // (stack pointer first, then position) is kept as is.
+   const int stack_pointer_register = compiler->AllocateRegister();
+   const int position_register = compiler->AllocateRegister();
+
+   // Translate the capture interval of this lookaround into a register
+   // interval: two registers per capture, first capture register is 2.
+   constexpr int kRegistersPerCapture = 2;
+   constexpr int kRegisterOfFirstCapture = 2;
+   const int register_count = capture_count_ * kRegistersPerCapture;
+   const int register_start =
+       kRegisterOfFirstCapture + capture_from_ * kRegistersPerCapture;
+
+   // Compile the body with the read direction implied by the lookaround type,
+   // then restore the direction that was active on entry.
+   const bool saved_read_backward = compiler->read_backward();
+   compiler->set_read_backward(type() == LOOKBEHIND);
+   Builder builder(is_positive(), on_success, stack_pointer_register,
+                   position_register, register_count, register_start);
+   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
+   RegExpNode* result = builder.ForMatch(match);
+   compiler->set_read_backward(saved_read_backward);
+   return result;
+ }
+
+ RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
+                                   RegExpNode* on_success) {
+   // Forward to the static overload with this capture's own body and index.
+   RegExpTree* capture_body = body();
+   const int capture_index = index();
+   return ToNode(capture_body, capture_index, compiler, on_success);
+ }
+
+ RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
+                                   RegExpCompiler* compiler,
+                                   RegExpNode* on_success) {
+   DCHECK_NOT_NULL(body);
+   // One register is written before the body and one after it. When reading
+   // backward the two registers of the capture trade places.
+   const bool backward = compiler->read_backward();
+   const int reg_before_body = backward ? RegExpCapture::EndRegister(index)
+                                        : RegExpCapture::StartRegister(index);
+   const int reg_after_body = backward ? RegExpCapture::StartRegister(index)
+                                       : RegExpCapture::EndRegister(index);
+   // The chain is built back to front: store-after -> body -> store-before.
+   RegExpNode* store_after =
+       ActionNode::StorePosition(reg_after_body, true, on_success);
+   RegExpNode* body_node = body->ToNode(compiler, store_after);
+   return ActionNode::StorePosition(reg_before_body, true, body_node);
+ }
+
+namespace {
+
+ // Rewrites runs of consecutive assertion terms inside a list of terms:
+ // repeated assertions of the same type are folded, and a run that contains
+ // both BOUNDARY and NON_BOUNDARY is replaced by a term that always fails.
+ class AssertionSequenceRewriter final {
+  public:
+   // Scans |terms| for maximal runs of two or more consecutive assertions and
+   // rewrites each run in place. Replacement nodes are allocated in |zone|.
+   //
+   // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
+   // instead of sprinkling rewrites into the AST->Node conversion process.
+   static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
+     AssertionSequenceRewriter rewriter(terms, zone);
+
+     static constexpr int kNoIndex = -1;
+     int from = kNoIndex;  // Start of the current run, or kNoIndex if none.
+
+     for (int i = 0; i < terms->length(); i++) {
+       RegExpTree* t = terms->at(i);
+       if (from == kNoIndex && t->IsAssertion()) {
+         from = i;  // Start a sequence.
+       } else if (from != kNoIndex && !t->IsAssertion()) {
+         // Terminate and process the sequence. Runs of length 1 are left
+         // alone since there is nothing to fold.
+         if (i - from > 1) rewriter.Rewrite(from, i);
+         from = kNoIndex;
+       }
+     }
+
+     // A run may extend to the very end of the list.
+     if (from != kNoIndex && terms->length() - from > 1) {
+       rewriter.Rewrite(from, terms->length());
+     }
+   }
+
+   // Rewrites the assertions at positions [from, to) of the term list.
+   //
+   // All assertions are zero width. A consecutive sequence of assertions is
+   // order-independent. There's two ways we can optimize here:
+   // 1. fold all identical assertions.
+   // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
+   //    sequence fails.
+   void Rewrite(int from, int to) {
+     DCHECK_GT(to, from + 1);
+
+     // Bitfield of all seen assertions, one bit per assertion type. The shifts
+     // below use an unsigned operand so that every bit position admitted by
+     // the static_assert (up to and including bit 31) is well defined; a
+     // signed `1 << 31` would overflow.
+     uint32_t seen_assertions = 0;
+     static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
+                   kUInt32Size * kBitsPerByte);
+
+     for (int i = from; i < to; i++) {
+       RegExpAssertion* t = terms_->at(i)->AsAssertion();
+       const uint32_t bit = uint32_t{1}
+                            << static_cast<int>(t->assertion_type());
+
+       if (seen_assertions & bit) {
+         // Fold duplicates.
+         terms_->Set(i, zone_->New<RegExpEmpty>());
+       }
+
+       seen_assertions |= bit;
+     }
+
+     // Collapse failures.
+     const uint32_t always_fails_mask =
+         (uint32_t{1} << static_cast<int>(RegExpAssertion::Type::BOUNDARY)) |
+         (uint32_t{1} << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY));
+     if ((seen_assertions & always_fails_mask) == always_fails_mask) {
+       ReplaceSequenceWithFailure(from, to);
+     }
+   }
+
+   // Overwrites positions [from, to): the first slot receives a character
+   // class built from an empty range list, every other slot an empty term.
+   void ReplaceSequenceWithFailure(int from, int to) {
+     // Replace the entire sequence with a single node that always fails.
+     // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
+     // negated '*' (everything) range serves the purpose.
+     ZoneList<CharacterRange>* ranges =
+         zone_->New<ZoneList<CharacterRange>>(0, zone_);
+     RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges);
+     terms_->Set(from, cc);
+
+     // Zero out the rest (one shared RegExpEmpty instance is reused).
+     RegExpEmpty* empty = zone_->New<RegExpEmpty>();
+     for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
+   }
+
+  private:
+   AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
+       : zone_(zone), terms_(terms) {}
+
+   Zone* zone_;                     // Allocator for replacement nodes.
+   ZoneList<RegExpTree*>* terms_;   // The list being rewritten in place.
+ };
+
+} // namespace
+
+ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
+                                       RegExpNode* on_success) {
+   compiler->ToNodeMaybeCheckForStackOverflow();
+
+   ZoneList<RegExpTree*>* children = nodes();
+
+   // Fold/collapse runs of adjacent assertions before building any nodes.
+   AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
+
+   // Each child is compiled with the node built so far as its continuation,
+   // so the child compiled last ends up at the head of the chain. Forward
+   // reading therefore walks the children last-to-first, backward reading
+   // walks them first-to-last.
+   RegExpNode* current = on_success;
+   if (compiler->read_backward()) {
+     int i = 0;
+     while (i < children->length()) {
+       current = children->at(i)->ToNode(compiler, current);
+       ++i;
+     }
+   } else {
+     int i = children->length();
+     while (i-- > 0) {
+       current = children->at(i)->ToNode(compiler, current);
+     }
+   }
+   return current;
+ }
+
+namespace {
+
+ // Appends the ranges described by the table |elmv| (|elmc| entries) to
+ // |ranges|. The table is a flat sequence of [from, to) pairs followed by a
+ // single kRangeEndMarker entry.
+ void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
+               Zone* zone) {
+   const int pair_entries = elmc - 1;  // Everything except the end marker.
+   DCHECK_EQ(kRangeEndMarker, elmv[pair_entries]);
+   for (int lo = 0; lo < pair_entries; lo += 2) {
+     const int hi = lo + 1;
+     DCHECK(elmv[lo] < elmv[hi]);
+     // The table's upper bound is exclusive, CharacterRange's is inclusive.
+     ranges->Add(CharacterRange::Range(elmv[lo], elmv[hi] - 1), zone);
+   }
+ }
+
+ // Appends to |ranges| the gaps between the [from, to) pairs of the table
+ // |elmv| (|elmc| entries including the trailing kRangeEndMarker), i.e. the
+ // complement of the table up to kMaxCodePoint. The table must neither start
+ // at 0 nor end at kMaxCodePoint.
+ void AddClassNegated(const int* elmv, int elmc,
+                      ZoneList<CharacterRange>* ranges, Zone* zone) {
+   const int pair_entries = elmc - 1;  // Everything except the end marker.
+   DCHECK_EQ(kRangeEndMarker, elmv[pair_entries]);
+   DCHECK_NE(0x0000, elmv[0]);
+   DCHECK_NE(kMaxCodePoint, elmv[pair_entries - 1]);
+   base::uc16 gap_start = 0x0000;
+   for (int lo = 0; lo < pair_entries; lo += 2) {
+     const int hi = lo + 1;
+     DCHECK(gap_start <= elmv[lo] - 1);
+     DCHECK(elmv[lo] < elmv[hi]);
+     // Emit the gap that precedes this table entry.
+     ranges->Add(CharacterRange::Range(gap_start, elmv[lo] - 1), zone);
+     gap_start = elmv[hi];
+   }
+   // Final gap, from the end of the last entry up to the maximum code point.
+   ranges->Add(CharacterRange::Range(gap_start, kMaxCodePoint), zone);
+ }
+
+} // namespace
+
+ void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
+                                     ZoneList<CharacterRange>* ranges,
+                                     bool add_unicode_case_equivalents,
+                                     Zone* zone) {
+   const bool is_word = standard_character_set == StandardCharacterSet::kWord;
+   const bool is_not_word =
+       standard_character_set == StandardCharacterSet::kNotWord;
+
+   if (add_unicode_case_equivalents && (is_word || is_not_word)) {
+     // See #sec-runtime-semantics-wordcharacters-abstract-operation
+     // In case of unicode and ignore_case, we need to create the closure over
+     // case equivalent characters before negating.
+     ZoneList<CharacterRange>* word_ranges =
+         zone->New<ZoneList<CharacterRange>>(2, zone);
+     AddClass(kWordRanges, kWordRangeCount, word_ranges, zone);
+     AddUnicodeCaseEquivalents(word_ranges, zone);
+     if (is_not_word) {
+       ZoneList<CharacterRange>* complement =
+           zone->New<ZoneList<CharacterRange>>(2, zone);
+       CharacterRange::Negate(word_ranges, complement, zone);
+       ranges->AddAll(*complement, zone);
+     } else {
+       ranges->AddAll(*word_ranges, zone);
+     }
+     return;
+   }
+
+   // Every remaining case maps directly onto one of the static range tables,
+   // either as is or negated.
+   switch (standard_character_set) {
+     case StandardCharacterSet::kWhitespace:
+       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
+       break;
+     case StandardCharacterSet::kNotWhitespace:
+       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
+       break;
+     case StandardCharacterSet::kWord:
+       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
+       break;
+     case StandardCharacterSet::kNotWord:
+       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
+       break;
+     case StandardCharacterSet::kDigit:
+       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
+       break;
+     case StandardCharacterSet::kNotDigit:
+       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
+       break;
+     // The set of characters matched by the $ and ^ symbols in multiline
+     // mode.
+     case StandardCharacterSet::kLineTerminator:
+       AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
+       break;
+     case StandardCharacterSet::kNotLineTerminator:
+       AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
+                       zone);
+       break;
+     // Not a character range as defined by the spec, but a convenient
+     // shorthand for a character class that matches any character.
+     case StandardCharacterSet::kEverything:
+       ranges->Add(CharacterRange::Everything(), zone);
+       break;
+   }
+ }
+
+ // static. Canonicalizes |ranges|, then adds the case equivalents of its members to the same list.
+ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
+ ZoneList<CharacterRange>* ranges,
+ bool is_one_byte) {
+ CharacterRange::Canonicalize(ranges);
+ int range_count = ranges->length();  // Loops below stop here, so entries added by this call are not revisited.
+ #ifdef V8_INTL_SUPPORT
+ icu::UnicodeSet others;
+ for (int i = 0; i < range_count; i++) {
+ CharacterRange range = ranges->at(i);
+ base::uc32 from = range.from();
+ if (from > kMaxUtf16CodeUnit) continue;
+ base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
+ // Nothing to be done for surrogates.
+ if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
+ if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
+ if (from > kMaxOneByteCharCode) continue;
+ if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode;
+ }
+ others.add(from, to);
+ }
+
+ // Compute the set of additional characters that should be added,
+ // using UnicodeSet::closeOver. ECMA 262 defines slightly different
+ // case-folding rules than Unicode, so some characters that are
+ // added by closeOver do not match anything other than themselves in
+ // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
+ // same case-insensitive character as 's' or 'S' according to
+ // Unicode, but does not match any other character in JS. To handle
+ // this case, we add such characters to the IgnoreSet and filter
+ // them out. We filter twice: once before calling closeOver (to
+ // prevent 'ſ' from adding 's'), and once after calling closeOver
+ // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
+ // more information.
+ icu::UnicodeSet already_added(others);
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
+ others.closeOver(USET_CASE_INSENSITIVE);
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
+ others.removeAll(already_added);  // Keep only characters that are new relative to the input.
+
+ // Add the remaining characters in |others| to |ranges|.
+ for (int32_t i = 0; i < others.getRangeCount(); i++) {
+ UChar32 from = others.getRangeStart(i);
+ UChar32 to = others.getRangeEnd(i);
+ if (from == to) {
+ ranges->Add(CharacterRange::Singleton(from), zone);
+ } else {
+ ranges->Add(CharacterRange::Range(from, to), zone);
+ }
+ }
+ #else
+ for (int i = 0; i < range_count; i++) {
+ CharacterRange range = ranges->at(i);
+ base::uc32 bottom = range.from();
+ if (bottom > kMaxUtf16CodeUnit) continue;
+ base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
+ // Nothing to be done for surrogates.
+ if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
+ if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
+ if (bottom > kMaxOneByteCharCode) continue;
+ if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode;
+ }
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ if (top == bottom) {
+ // If this is a singleton we just expand the one character.
+ int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
+ for (int i = 0; i < length; i++) {  // NOTE: this |i| shadows the outer loop index.
+ base::uc32 chr = chars[i];
+ if (chr != bottom) {
+ ranges->Add(CharacterRange::Singleton(chars[i]), zone);
+ }
+ }
+ } else {
+ // If this is a range we expand the characters block by block, expanding
+ // contiguous subranges (blocks) one at a time. The approach is as
+ // follows. For a given start character we look up the remainder of the
+ // block that contains it (represented by the end point), for instance we
+ // find 'z' if the character is 'c'. A block is characterized by the
+ // property that all characters uncanonicalize in the same way, except
+ // that each entry in the result is incremented by the distance from the
+ // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
+ // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
+ // we've found the end point we look up its uncanonicalization and
+ // produce a range for each element. For instance for [c-f] we look up
+ // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
+ // it is not already contained in the input, so [c-f] will be skipped but
+ // [C-F] will be added. If this range is not completely contained in a
+ // block we do this for all the blocks covered by the range (handling
+ // characters that are not in a block as a "singleton block").
+ unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ base::uc32 pos = bottom;
+ while (pos <= top) {
+ int length =
+ isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
+ base::uc32 block_end;
+ if (length == 0) {
+ block_end = pos;  // No block found: treat |pos| as a singleton block.
+ } else {
+ DCHECK_EQ(1, length);
+ block_end = equivalents[0];
+ }
+ int end = (block_end > top) ? top : block_end;  // Clamp the block to the input range.
+ length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
+ equivalents);
+ for (int i = 0; i < length; i++) {  // NOTE: this |i| shadows the outer loop index.
+ base::uc32 c = equivalents[i];
+ base::uc32 range_from = c - (block_end - pos);
+ base::uc32 range_to = c - (block_end - end);
+ if (!(bottom <= range_from && range_to <= top)) {
+ ranges->Add(CharacterRange::Range(range_from, range_to), zone);
+ }
+ }
+ pos = end + 1;
+ }
+ }
+ }
+ #endif // V8_INTL_SUPPORT
+ }
+
+ bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) {
+   DCHECK_NOT_NULL(ranges);
+   // A list is canonical when every range starts at least two code points
+   // after the end of its predecessor, i.e. the ranges are increasing,
+   // non-overlapping and non-adjacent. Lists of length 0 or 1 trivially are.
+   const int count = ranges->length();
+   for (int i = 1; i < count; i++) {
+     const base::uc32 previous_end = ranges->at(i - 1).to();
+     if (ranges->at(i).from() <= previous_end + 1) return false;
+   }
+   return true;
+ }
+
+ ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
+   // Explicit ranges already exist: hand them out as they are.
+   if (ranges_ != nullptr) return ranges_;
+
+   // Otherwise materialize (and cache) the ranges of the standard set type.
+   ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
+   CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
+                                  zone);
+   return ranges_;
+ }
+
+namespace {
+
+ // Copies |count| consecutive elements of |list| from index |from| to index
+ // |to| within the same list. The source and target areas may overlap, so the
+ // copy direction is chosen such that no element is overwritten before it has
+ // been read.
+ void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
+   if (from < to) {
+     // Moving towards higher indices: copy the last element first.
+     int offset = count;
+     while (offset-- > 0) {
+       list->at(to + offset) = list->at(from + offset);
+     }
+     return;
+   }
+   // Moving towards lower indices (or in place): copy the first element first.
+   for (int offset = 0; offset < count; ++offset) {
+     list->at(to + offset) = list->at(from + offset);
+   }
+ }
+
+ int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
+ CharacterRange insert) {
+ // Inserts a range into list[0..count[, which must be sorted
+ // by from value and non-overlapping and non-adjacent, using at most
+ // list[0..count] for the result. Returns the number of resulting
+ // canonicalized ranges. Inserting a range may collapse existing ranges into
+ // fewer ranges, so the return value can be anything in the range 1..count+1.
+ base::uc32 from = insert.from();
+ base::uc32 to = insert.to();
+ int start_pos = 0;
+ int end_pos = count;
+ for (int i = count - 1; i >= 0; i--) {  // Scan from the back of the canonical part.
+ CharacterRange current = list->at(i);
+ if (current.from() > to + 1) {
+ end_pos = i;  // |current| lies strictly after the insert and is not adjacent to it.
+ } else if (current.to() + 1 < from) {
+ start_pos = i + 1;  // |current| lies strictly before the insert and is not adjacent to it.
+ break;  // The list is sorted, so no earlier range can touch the insert either.
+ }
+ }
+
+ // Inserted range overlaps, or is adjacent to, ranges at positions
+ // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
+ // not affected by the insertion.
+ // If start_pos == end_pos, the range must be inserted before start_pos.
+ // If start_pos < end_pos, the entire range from start_pos to end_pos
+ // must be merged with the insert range.
+
+ if (start_pos == end_pos) {
+ // Insert between existing ranges at position start_pos.
+ if (start_pos < count) {
+ MoveRanges(list, start_pos, start_pos + 1, count - start_pos);  // Shift the tail up by one slot.
+ }
+ list->at(start_pos) = insert;
+ return count + 1;
+ }
+ if (start_pos + 1 == end_pos) {
+ // Replace single existing range at position start_pos.
+ CharacterRange to_replace = list->at(start_pos);
+ int new_from = std::min(to_replace.from(), from);
+ int new_to = std::max(to_replace.to(), to);
+ list->at(start_pos) = CharacterRange::Range(new_from, new_to);
+ return count;
+ }
+ // Replace a number of existing ranges from start_pos to end_pos - 1.
+ // Move the remaining ranges down.
+
+ int new_from = std::min(list->at(start_pos).from(), from);
+ int new_to = std::max(list->at(end_pos - 1).to(), to);
+ if (end_pos < count) {
+ MoveRanges(list, end_pos, start_pos + 1, count - end_pos);  // Close the gap left by the merged ranges.
+ }
+ list->at(start_pos) = CharacterRange::Range(new_from, new_to);
+ return count - (end_pos - start_pos) + 1;
+ }
+
+} // namespace
+
+ void CharacterSet::Canonicalize() {
+   // Special/default classes (no explicit range list) are always considered
+   // canonical; the result of calling ranges() will be sorted. Only an
+   // explicit range list needs work.
+   if (ranges_ != nullptr) {
+     CharacterRange::Canonicalize(ranges_);
+   }
+ }
+
+ // static
+ void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
+   const int n = character_ranges->length();
+   if (n <= 1) return;
+
+   // Determine the length of the prefix that is already canonical
+   // (increasing, non-overlapping, non-adjacent).
+   int canonical_prefix = 1;
+   base::uc32 max = character_ranges->at(0).to();
+   for (; canonical_prefix < n; canonical_prefix++) {
+     CharacterRange current = character_ranges->at(canonical_prefix);
+     if (current.from() <= max + 1) break;
+     max = current.to();
+   }
+   // If the prefix covers the whole list there is nothing left to do.
+   if (canonical_prefix == n) return;
+
+   // The ranges from |canonical_prefix| onwards are not canonicalized. Make
+   // them so with the equivalent of an insertion sort: each one is inserted,
+   // in order, into the canonical part in front of it. An insertion can
+   // shrink the canonical part because adjacent and overlapping ranges are
+   // combined.
+   int num_canonical = canonical_prefix;
+   for (int read = canonical_prefix; read < n; read++) {
+     num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
+                                                character_ranges->at(read));
+   }
+   character_ranges->Rewind(num_canonical);
+
+   DCHECK(CharacterRange::IsCanonical(character_ranges));
+ }
+
+ // static
+ // Writes the complement of |ranges| within [0, kMaxCodePoint] to
+ // |negated_ranges|. |ranges| must be canonical and |negated_ranges| must be
+ // empty on entry; new entries are allocated in |zone|.
+ void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,
+                             ZoneList<CharacterRange>* negated_ranges,
+                             Zone* zone) {
+   DCHECK(CharacterRange::IsCanonical(ranges));
+   DCHECK_EQ(0, negated_ranges->length());
+   int range_count = ranges->length();
+   base::uc32 from = 0;  // First code point not yet covered by the output.
+   int i = 0;
+   if (range_count > 0 && ranges->at(0).from() == 0) {
+     // The input starts at 0, so there is no gap in front of the first range.
+     from = ranges->at(0).to() + 1;
+     i = 1;
+   }
+   while (i < range_count) {
+     // Emit the gap between the previous range (or 0) and this one.
+     CharacterRange range = ranges->at(i);
+     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
+     from = range.to() + 1;
+     i++;
+   }
+   // Emit the trailing gap. The comparison must be inclusive: when the last
+   // input range ends at kMaxCodePoint - 1, the gap is the single code point
+   // kMaxCodePoint, which a strict '<' would silently drop.
+   if (from <= kMaxCodePoint) {
+     negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
+   }
+ }
+
+ // static
+ void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,
+                                const ZoneList<CharacterRange>* rhs,
+                                ZoneList<CharacterRange>* intersection,
+                                Zone* zone) {
+   DCHECK(CharacterRange::IsCanonical(lhs));
+   DCHECK(CharacterRange::IsCanonical(rhs));
+   DCHECK_EQ(0, intersection->length());
+
+   // Walk both canonical lists in step, one cursor per list.
+   int lhs_index = 0;
+   int rhs_index = 0;
+   while (lhs_index < lhs->length() && rhs_index < rhs->length()) {
+     CharacterRange left = lhs->at(lhs_index);
+     CharacterRange right = rhs->at(rhs_index);
+
+     if (left.to() < right.from()) {
+       // |left| ends before |right| begins: no overlap, advance lhs.
+       lhs_index++;
+     } else if (right.to() < left.from()) {
+       // |right| ends before |left| begins: no overlap, advance rhs.
+       rhs_index++;
+     } else {
+       // The two ranges overlap; emit the common part and advance whichever
+       // side ends at the end of that part.
+       base::uc32 from = std::max(left.from(), right.from());
+       base::uc32 to = std::min(left.to(), right.to());
+       intersection->Add(CharacterRange::Range(from, to), zone);
+       if (to == left.to()) {
+         lhs_index++;
+       } else {
+         rhs_index++;
+       }
+     }
+   }
+
+   DCHECK(IsCanonical(intersection));
+ }
+
+namespace {
+
+ // Increments |*index|. If the new index is still inside |range|, |*from| and
+ // |*to| are loaded from the range at that index. Otherwise |*from| is set to
+ // a value one past the largest legal code point and |*to| is left untouched.
+ void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
+                       base::uc32* from, base::uc32* to) {
+   const int next = ++(*index);
+   if (next >= range->length()) {
+     *from = kMaxCodePoint + 1;
+     return;
+   }
+   *from = range->at(next).from();
+   *to = range->at(next).to();
+ }
+
+} // namespace
+
+ // static. Writes |src| minus |to_remove| to the initially empty |result|; both inputs must be canonical.
+ void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,
+ const ZoneList<CharacterRange>* to_remove,
+ ZoneList<CharacterRange>* result, Zone* zone) {
+ DCHECK(CharacterRange::IsCanonical(src));
+ DCHECK(CharacterRange::IsCanonical(to_remove));
+ DCHECK_EQ(0, result->length());
+
+ if (src->is_empty()) return;
+
+ int src_index = 0;
+ int to_remove_index = 0;
+ base::uc32 from = src->at(src_index).from();  // [from, to] is the not-yet-emitted rest of the current src range.
+ base::uc32 to = src->at(src_index).to();
+ while (src_index < src->length() && to_remove_index < to_remove->length()) {
+ CharacterRange remove_range = to_remove->at(to_remove_index);
+ if (remove_range.to() < from) {
+ // (a) Non-overlapping case, ignore current to_remove range.
+ // |-------|
+ // |-------|
+ to_remove_index++;
+ } else if (to < remove_range.from()) {
+ // (b) Non-overlapping case, add full current range to result.
+ // |-------|
+ // |-------|
+ result->Add(CharacterRange::Range(from, to), zone);
+ SafeAdvanceRange(src, &src_index, &from, &to);
+ } else if (from >= remove_range.from() && to <= remove_range.to()) {
+ // (c) Current to_remove range fully covers current range.
+ // |---|
+ // |-------|
+ SafeAdvanceRange(src, &src_index, &from, &to);
+ } else if (from < remove_range.from() && to > remove_range.to()) {
+ // (d) Split current range.
+ // |-------|
+ // |---|
+ result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);
+ from = remove_range.to() + 1;  // Continue with the part behind the removed piece.
+ to_remove_index++;
+ } else if (from < remove_range.from()) {
+ // (e) End current range.
+ // |-------|
+ // |-------|
+ to = remove_range.from() - 1;
+ result->Add(CharacterRange::Range(from, to), zone);
+ SafeAdvanceRange(src, &src_index, &from, &to);
+ } else if (to > remove_range.to()) {
+ // (f) Modify start of current range.
+ // |-------|
+ // |-------|
+ from = remove_range.to() + 1;
+ to_remove_index++;
+ } else {
+ UNREACHABLE();
+ }
+ }
+ // The last range needs special treatment after |to_remove| is exhausted, as
+ // |from| might have been modified by the last |to_remove| range and |to| was
+ // not yet known (i.e. cases d and f).
+ if (from <= to) {  // False once |src| ran out: SafeAdvanceRange then set |from| past kMaxCodePoint.
+ result->Add(CharacterRange::Range(from, to), zone);
+ }
+ src_index++;  // Step past the range handled just above.
+
+ // Add remaining ranges after |to_remove| is exhausted.
+ for (; src_index < src->length(); src_index++) {
+ result->Add(src->at(src_index), zone);
+ }
+
+ DCHECK(IsCanonical(result));
+ }
+
+ // static
+ void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
+   DCHECK(IsCanonical(ranges));
+
+   // |ranges| is canonical, i.e. sorted and non-overlapping, so everything
+   // outside the one-byte range sits at the tail of the list. Walk backwards:
+   // ranges that start above the one-byte maximum are dropped; the first
+   // range that starts inside it is trimmed at the maximum and ends the walk.
+   static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
+   int kept = ranges->length();
+   while (kept > 0) {
+     CharacterRange& candidate = ranges->at(kept - 1);
+     if (candidate.from() <= max_char) {
+       candidate.to_ = std::min(candidate.to_, max_char);
+       break;
+     }
+     kept--;
+   }
+
+   ranges->Rewind(kept);
+ }
+
+ // static
+ bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
+                             const ZoneList<CharacterRange>* rhs) {
+   DCHECK(IsCanonical(lhs));
+   DCHECK(IsCanonical(rhs));
+
+   // Two canonical lists are equal when they have the same length and agree
+   // element by element.
+   const int length = lhs->length();
+   if (length != rhs->length()) return false;
+
+   int i = 0;
+   while (i < length && !(lhs->at(i) != rhs->at(i))) {
+     i++;
+   }
+   return i == length;
+ }
+
+namespace {
+
+ // Scoped helper that tracks how far quantifier loops have been unrolled by
+ // the regexp graph generator. On construction it multiplies the compiler's
+ // current expansion factor by |factor| (saturating just above the maximum);
+ // on destruction it restores the factor that was in effect before.
+ class RegExpExpansionLimiter {
+  public:
+   static const int kMaxExpansionFactor = 6;
+
+   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
+       : compiler_(compiler),
+         saved_expansion_factor_(compiler->current_expansion_factor()),
+         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
+     DCHECK_LT(0, factor);
+     // Already over budget: leave the compiler's factor untouched.
+     if (!ok_to_expand_) return;
+
+     int new_factor;
+     if (factor > kMaxExpansionFactor) {
+       // Avoid integer overflow of the current expansion factor: any value
+       // above the maximum is recorded as "maximum + 1".
+       new_factor = kMaxExpansionFactor + 1;
+     } else {
+       new_factor = saved_expansion_factor_ * factor;
+     }
+     ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
+     compiler->set_current_expansion_factor(new_factor);
+   }
+
+   ~RegExpExpansionLimiter() {
+     compiler_->set_current_expansion_factor(saved_expansion_factor_);
+   }
+
+   // Whether expanding by the requested factor stays within the budget.
+   bool ok_to_expand() { return ok_to_expand_; }
+
+  private:
+   RegExpCompiler* compiler_;
+   int saved_expansion_factor_;  // Factor to restore on destruction.
+   bool ok_to_expand_;
+
+   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
+ };
+
+} // namespace
+
+ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
+ RegExpTree* body, RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ bool not_at_start) {
+ // x{f, t} becomes this:
+ //
+ // (r++)<-.
+ // | `
+ // | (x)
+ // v ^
+ // (r=0)-->(?)---/ [if r < t]
+ // |
+ // [if r >= f] \----> ...
+ //
+
+ // 15.10.2.5 RepeatMatcher algorithm.
+ // The parser has already eliminated the case where max is 0. In the case
+ // where max_match is zero the parser has removed the quantifier if min was
+ // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
+
+ // If we know that we cannot match zero length then things are a little
+ // simpler since we don't need to make the special zero length match check
+ // from step 2.1. If the min and max are small we can unroll a little in
+ // this case.
+ static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
+ static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
+ if (max == 0) return on_success; // This can happen due to recursion.
+ bool body_can_be_empty = (body->min_match() == 0);
+ int body_start_reg = RegExpCompiler::kNoRegister;  // Only allocated when the body can match empty.
+ Interval capture_registers = body->CaptureRegisters();
+ bool needs_capture_clearing = !capture_registers.is_empty();
+ Zone* zone = compiler->zone();
+
+ if (body_can_be_empty) {
+ body_start_reg = compiler->AllocateRegister();
+ } else if (compiler->optimize() && !needs_capture_clearing) {
+ // Only unroll if there are no captures and the body can't be
+ // empty.
+ {
+ RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
+ if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
+ int new_max = (max == kInfinity) ? max : max - min;
+ // Recurse once to get the loop or optional matches after the fixed
+ // ones.
+ RegExpNode* answer =
+ ToNode(0, new_max, is_greedy, body, compiler, on_success, true);  // Last argument is not_at_start.
+ // Unroll the forced matches from 0 to min. This can cause chains of
+ // TextNodes (which the parser does not generate). These should be
+ // combined if it turns out they hinder good code generation.
+ for (int i = 0; i < min; i++) {
+ answer = body->ToNode(compiler, answer);
+ }
+ return answer;
+ }
+ }
+ if (max <= kMaxUnrolledMaxMatches && min == 0) {
+ DCHECK_LT(0, max); // Due to the 'if' above.
+ RegExpExpansionLimiter limiter(compiler, max);
+ if (limiter.ok_to_expand()) {
+ // Unroll the optional matches up to max.
+ RegExpNode* answer = on_success;
+ for (int i = 0; i < max; i++) {
+ ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
+ if (is_greedy) {
+ alternation->AddAlternative(
+ GuardedAlternative(body->ToNode(compiler, answer)));
+ alternation->AddAlternative(GuardedAlternative(on_success));
+ } else {
+ alternation->AddAlternative(GuardedAlternative(on_success));
+ alternation->AddAlternative(
+ GuardedAlternative(body->ToNode(compiler, answer)));
+ }
+ answer = alternation;
+ if (not_at_start && !compiler->read_backward()) {
+ alternation->set_not_at_start();
+ }
+ }
+ return answer;
+ }
+ }
+ }
+ bool has_min = min > 0;
+ bool has_max = max < RegExpTree::kInfinity;
+ bool needs_counter = has_min || has_max;  // A counter register is needed only if some bound has to be checked.
+ int reg_ctr = needs_counter ? compiler->AllocateRegister()
+ : RegExpCompiler::kNoRegister;
+ LoopChoiceNode* center = zone->New<LoopChoiceNode>(
+ body->min_match() == 0, compiler->read_backward(), min, zone);
+ if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
+ RegExpNode* loop_return =
+ needs_counter ? static_cast<RegExpNode*>(
+ ActionNode::IncrementRegister(reg_ctr, center))
+ : static_cast<RegExpNode*>(center);
+ if (body_can_be_empty) {
+ // If the body can be empty we need to check if it was and then
+ // backtrack.
+ loop_return =
+ ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
+ }
+ RegExpNode* body_node = body->ToNode(compiler, loop_return);
+ if (body_can_be_empty) {
+ // If the body can be empty we need to store the start position
+ // so we can bail out if it was empty.
+ body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
+ }
+ if (needs_capture_clearing) {
+ // Before entering the body of this loop we need to clear captures.
+ body_node = ActionNode::ClearCaptures(capture_registers, body_node);
+ }
+ GuardedAlternative body_alt(body_node);
+ if (has_max) {
+ Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);  // Take the loop only while counter < max.
+ body_alt.AddGuard(body_guard, zone);
+ }
+ GuardedAlternative rest_alt(on_success);
+ if (has_min) {
+ Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);  // Leave the loop only once counter >= min.
+ rest_alt.AddGuard(rest_guard, zone);
+ }
+ if (is_greedy) {
+ center->AddLoopAlternative(body_alt);
+ center->AddContinueAlternative(rest_alt);
+ } else {
+ center->AddContinueAlternative(rest_alt);
+ center->AddLoopAlternative(body_alt);
+ }
+ if (needs_counter) {
+ return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);  // Start the counter at 0 before entering the loop.
+ } else {
+ return center;
+ }
+ }
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-compiler.cc b/js/src/irregexp/imported/regexp-compiler.cc
new file mode 100644
index 0000000000..514975d8ed
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-compiler.cc
@@ -0,0 +1,3955 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-compiler.h"
+
+#include "irregexp/imported/regexp-macro-assembler-arch.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "irregexp/imported/special-case.h"
+#include "unicode/locid.h"
+#include "unicode/uniset.h"
+#include "unicode/utypes.h"
+#endif // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
+
+// -------------------------------------------------------------------
+// Implementation of the Irregexp regular expression engine.
+//
+// The Irregexp regular expression engine is intended to be a complete
+// implementation of ECMAScript regular expressions. It generates either
+// bytecodes or native code.
+
+// The Irregexp regexp engine is structured in three steps.
+// 1) The parser generates an abstract syntax tree. See ast.cc.
+// 2) From the AST a node network is created. The nodes are all
+// subclasses of RegExpNode. The nodes represent states when
+// executing a regular expression. Several optimizations are
+// performed on the node network.
+// 3) From the nodes we generate either byte codes or native code
+// that can actually execute the regular expression (perform
+// the search). The code generation step is described in more
+// detail below.
+
+// Code generation.
+//
+// The nodes are divided into four main categories.
+// * Choice nodes
+// These represent places where the regular expression can
+// match in more than one way. For example on entry to an
+// alternation (foo|bar) or a repetition (*, +, ? or {}).
+// * Action nodes
+// These represent places where some action should be
+// performed. Examples include recording the current position
+// in the input string to a register (in order to implement
+// captures) or other actions on register for example in order
+// to implement the counters needed for {} repetitions.
+// * Matching nodes
+// These attempt to match some element part of the input string.
+// Examples of elements include character classes, plain strings
+// or back references.
+// * End nodes
+// These are used to implement the actions required on finding
+// a successful match or failing to find a match.
+//
+// The code generated (whether as byte codes or native code) maintains
+// some state as it runs. This consists of the following elements:
+//
+// * The capture registers. Used for string captures.
+// * Other registers. Used for counters etc.
+// * The current position.
+// * The stack of backtracking information. Used when a matching node
+// fails to find a match and needs to try an alternative.
+//
+// Conceptual regular expression execution model:
+//
+// There is a simple conceptual model of regular expression execution
+// which will be presented first. The actual code generated is a more
+// efficient simulation of the simple conceptual model:
+//
+// * Choice nodes are implemented as follows:
+// For each choice except the last {
+// push current position
+// push backtrack code location
+// <generate code to test for choice>
+// backtrack code location:
+// pop current position
+// }
+// <generate code to test for last choice>
+//
+ // * Action nodes are generated as follows:
+// <push affected registers on backtrack stack>
+// <generate code to perform action>
+// push backtrack code location
+// <generate code to test for following nodes>
+// backtrack code location:
+// <pop affected registers to restore their state>
+// <pop backtrack location from stack and go to it>
+//
+// * Matching nodes are generated as follows:
+// if input string matches at current position
+// update current position
+// <generate code to test for following nodes>
+// else
+// <pop backtrack location from stack and go to it>
+//
+// Thus it can be seen that the current position is saved and restored
+ // by the choice nodes, whereas the registers are saved and restored
+ // by the action nodes that manipulate them.
+//
+// The other interesting aspect of this model is that nodes are generated
+// at the point where they are needed by a recursive call to Emit(). If
+// the node has already been code generated then the Emit() call will
+// generate a jump to the previously generated code instead. In order to
+// limit recursion it is possible for the Emit() function to put the node
+// on a work list for later generation and instead generate a jump. The
+// destination of the jump is resolved later when the code is generated.
+//
+// Actual regular expression code generation.
+//
+// Code generation is actually more complicated than the above. In order to
+// improve the efficiency of the generated code some optimizations are
+ // performed:
+//
+// * Choice nodes have 1-character lookahead.
+// A choice node looks at the following character and eliminates some of
+// the choices immediately based on that character. This is not yet
+// implemented.
+// * Simple greedy loops store reduced backtracking information.
+// A quantifier like /.*foo/m will greedily match the whole input. It will
+// then need to backtrack to a point where it can match "foo". The naive
+// implementation of this would push each character position onto the
+// backtracking stack, then pop them off one by one. This would use space
+// proportional to the length of the input string. However since the "."
+// can only match in one way and always has a constant length (in this case
+// of 1) it suffices to store the current position on the top of the stack
+// once. Matching now becomes merely incrementing the current position and
+// backtracking becomes decrementing the current position and checking the
+// result against the stored current position. This is faster and saves
+// space.
+// * The current state is virtualized.
+// This is used to defer expensive operations until it is clear that they
+// are needed and to generate code for a node more than once, allowing
+ // specialized and efficient versions of the code to be created. This is
+// explained in the section below.
+//
+// Execution state virtualization.
+//
+// Instead of emitting code, nodes that manipulate the state can record their
+// manipulation in an object called the Trace. The Trace object can record a
+// current position offset, an optional backtrack code location on the top of
+// the virtualized backtrack stack and some register changes. When a node is
+// to be emitted it can flush the Trace or update it. Flushing the Trace
+// will emit code to bring the actual state into line with the virtual state.
+// Avoiding flushing the state can postpone some work (e.g. updates of capture
+// registers). Postponing work can save time when executing the regular
+// expression since it may be found that the work never has to be done as a
+// failure to match can occur. In addition it is much faster to jump to a
+// known backtrack code location than it is to pop an unknown backtrack
+// location from the stack and jump there.
+//
+// The virtual state found in the Trace affects code generation. For example
+// the virtual state contains the difference between the actual current
+// position and the virtual current position, and matching code needs to use
+// this offset to attempt a match in the correct location of the input
+// string. Therefore code generated for a non-trivial trace is specialized
+// to that trace. The code generator therefore has the ability to generate
+// code for each node several times. In order to limit the size of the
+// generated code there is an arbitrary limit on how many specialized sets of
+// code may be generated for a given node. If the limit is reached, the
+// trace is flushed and a generic version of the code for a node is emitted.
+// This is subsequently used for that node. The code emitted for non-generic
+// trace is not recorded in the node and so it cannot currently be reused in
+// the event that code generation is requested for an identical trace.
+
+namespace {
+
+ // Picks the maximum code unit constant that matches the subject string
+ // width: the one-byte limit when `one_byte` is set, the UTF-16 limit
+ // otherwise.
+ constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
+   // Both limits are required to fit into 16 bits.
+   static_assert(String::kMaxOneByteCharCodeU <=
+                 std::numeric_limits<uint16_t>::max());
+   static_assert(String::kMaxUtf16CodeUnitU <=
+                 std::numeric_limits<uint16_t>::max());
+   if (one_byte) {
+     return String::kMaxOneByteCharCodeU;
+   }
+   return String::kMaxUtf16CodeUnitU;
+ }
+
+ // Returns a bit mask covering every code unit of the given subject width.
+ // Because each maximum is one less than a power of two (asserted below),
+ // the maximum code unit doubles as the mask.
+ constexpr uint32_t CharMask(const bool one_byte) {
+   static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
+   static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
+   const uint32_t all_ones_mask = MaxCodeUnit(one_byte);
+   return all_ones_mask;
+ }
+
+} // namespace
+
+void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
+
+ // Appends this atom to `text` as a single atom text element.
+ void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
+   TextElement element = TextElement::Atom(this);
+   text->AddElement(element, zone);
+ }
+
+ // Appends this character class to `text` as a single class-ranges element.
+ void RegExpClassRanges::AppendToText(RegExpText* text, Zone* zone) {
+   TextElement element = TextElement::ClassRanges(this);
+   text->AddElement(element, zone);
+ }
+
+ // Appends every element of this text, in order, to `text`.
+ void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
+   int index = 0;
+   // The length is re-read on every iteration, as in a plain for loop.
+   while (index < elements()->length()) {
+     text->AddElement(elements()->at(index), zone);
+     ++index;
+   }
+ }
+
+ // Builds a text element of type ATOM wrapping `atom`.
+ TextElement TextElement::Atom(RegExpAtom* atom) {
+   TextElement element(ATOM, atom);
+   return element;
+ }
+
+ // Builds a text element of type CLASS_RANGES wrapping `class_ranges`.
+ TextElement TextElement::ClassRanges(RegExpClassRanges* class_ranges) {
+   TextElement element(CLASS_RANGES, class_ranges);
+   return element;
+ }
+
+ // Length of this element: the atom's own length for ATOM elements, and 1
+ // for CLASS_RANGES elements.
+ int TextElement::length() const {
+   const auto kind = text_type();
+   if (kind == ATOM) {
+     return atom()->length();
+   }
+   if (kind == CLASS_RANGES) {
+     return 1;
+   }
+   UNREACHABLE();
+ }
+
+ // Scope guard for the compiler's recursion depth: the constructor increments
+ // the depth and the destructor decrements it again, so the counter stays
+ // balanced on every exit path of the enclosing scope.
+ class RecursionCheck {
+  public:
+   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
+     compiler_->IncrementRecursionDepth();
+   }
+   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
+
+   // A copy would run the destructor a second time without a matching
+   // increment and unbalance the counter, so copying is disallowed.
+   RecursionCheck(const RecursionCheck&) = delete;
+   RecursionCheck& operator=(const RecursionCheck&) = delete;
+
+  private:
+   RegExpCompiler* compiler_;
+ };
+
+ // NOTE(review): the next two sentences look like they describe a compile
+ // entry point rather than this constructor -- confirm and relocate.
+ // Attempts to compile the regexp using an Irregexp code generator. Returns
+ // a fixed array or a null handle depending on whether it succeeded.
+ //
+ // Constructor: initializes the compiler state. Register allocation starts
+ // after the registers reserved for `capture_count` captures, the unicode
+ // lookaround registers start out unallocated (kNoRegister), and the shared
+ // ACCEPT end node is allocated in `zone`.
+ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
+ RegExpFlags flags, bool one_byte)
+ : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
+ unicode_lookaround_stack_register_(kNoRegister),
+ unicode_lookaround_position_register_(kNoRegister),
+ work_list_(nullptr),
+ recursion_depth_(0),
+ flags_(flags),
+ one_byte_(one_byte),
+ reg_exp_too_big_(false),
+ limiting_recursion_(false),
+ optimize_(v8_flags.regexp_optimization),
+ read_backward_(false),
+ current_expansion_factor_(1),
+ frequency_collator_(),
+ isolate_(isolate),
+ zone_(zone) {
+ accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
+ // The capture registers alone must not exceed the assembler's register limit.
+ DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
+ }
+
+ // Emits code for the node graph rooted at `start` into `macro_assembler`.
+ //
+ // The start node is emitted first, followed by a global failure target, and
+ // then every node that was deferred onto the work list during emission.
+ // Returns CompilationResult::RegExpTooBig() if the too-big flag was raised
+ // while emitting; otherwise returns the generated code together with the
+ // number of registers allocated.
+ RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
+     Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
+     int capture_count, Handle<String> pattern) {
+   macro_assembler_ = macro_assembler;
+
+   ZoneVector<RegExpNode*> work_list(zone());
+   work_list_ = &work_list;
+   Label fail;
+   macro_assembler_->PushBacktrack(&fail);
+   Trace new_trace;
+   start->Emit(this, &new_trace);
+   macro_assembler_->BindJumpTarget(&fail);
+   macro_assembler_->Fail();
+   while (!work_list.empty()) {
+     RegExpNode* node = work_list.back();
+     work_list.pop_back();
+     node->set_on_work_list(false);
+     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
+   }
+   // `work_list` is a local. Detach the member before any return so that it
+   // never dangles, including on the too-big path below.
+   work_list_ = nullptr;
+
+   if (reg_exp_too_big_) {
+     if (v8_flags.correctness_fuzzer_suppressions) {
+       FATAL("Aborting on excess zone allocation");
+     }
+     macro_assembler_->AbortedCodeGeneration();
+     return CompilationResult::RegExpTooBig();
+   }
+
+   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
+   isolate->IncreaseTotalRegexpCodeGenerated(code);
+
+   return {code, next_register_};
+ }
+
+ // True if this deferred action touches register `that`. A CLEAR_CAPTURES
+ // action touches every register in its range; any other action touches
+ // only its own register.
+ bool Trace::DeferredAction::Mentions(int that) {
+   if (action_type() != ActionNode::CLEAR_CAPTURES) {
+     return reg() == that;
+   }
+   Interval cleared = static_cast<DeferredClearCaptures*>(this)->range();
+   return cleared.Contains(that);
+ }
+
+ // True if any deferred action recorded in this trace touches `reg`.
+ bool Trace::mentions_reg(int reg) {
+   DeferredAction* current = actions_;
+   while (current != nullptr) {
+     if (current->Mentions(reg)) {
+       return true;
+     }
+     current = current->next();
+   }
+   return false;
+ }
+
+ // Looks at the first deferred action in the list that touches `reg`. If it
+ // is a STORE_POSITION, writes its cp_offset to `*cp_offset` and returns
+ // true. Returns false if that action is of another type or if no action
+ // touches `reg`. `*cp_offset` must be zero on entry.
+ bool Trace::GetStoredPosition(int reg, int* cp_offset) {
+   DCHECK_EQ(0, *cp_offset);
+   for (DeferredAction* current = actions_; current != nullptr;
+        current = current->next()) {
+     if (!current->Mentions(reg)) continue;
+     // Only the first mentioning action decides the outcome.
+     if (current->action_type() != ActionNode::STORE_POSITION) {
+       return false;
+     }
+     *cp_offset = static_cast<DeferredCapture*>(current)->cp_offset();
+     return true;
+   }
+   return false;
+ }
+
+ // A (dynamically-sized) set of unsigned integers that behaves especially well
+ // on small integers (< kFirstLimit). May do zone-allocation.
+ //
+ // Values below kFirstLimit are kept as bits of a 32-bit word; larger values
+ // go into a lazily allocated zone list.
+ class DynamicBitSet : public ZoneObject {
+  public:
+   // Returns true if `value` has been added to the set.
+   V8_EXPORT_PRIVATE bool Get(unsigned value) const {
+     if (value < kFirstLimit) {
+       return (first_ & BitFor(value)) != 0;
+     } else if (remaining_ == nullptr) {
+       return false;
+     } else {
+       return remaining_->Contains(value);
+     }
+   }
+
+   // Destructively set a value in this set.
+   void Set(unsigned value, Zone* zone) {
+     if (value < kFirstLimit) {
+       first_ |= BitFor(value);
+     } else {
+       if (remaining_ == nullptr)
+         remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
+       if (remaining_->is_empty() || !remaining_->Contains(value))
+         remaining_->Add(value, zone);
+     }
+   }
+
+  private:
+   static constexpr unsigned kFirstLimit = 32;
+
+   // Bit mask for a small value. The shift is done on an unsigned 32-bit one
+   // so that value == 31 does not shift into the sign bit of a signed int.
+   static constexpr uint32_t BitFor(unsigned value) {
+     return uint32_t{1} << value;
+   }
+
+   uint32_t first_ = 0;
+   ZoneList<unsigned>* remaining_ = nullptr;
+ };
+
+ // Records in `affected_registers` every register touched by a deferred
+ // action of this trace and returns the highest such register number, or
+ // RegExpCompiler::kNoRegister if nothing raised it.
+ int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
+                                  Zone* zone) {
+   int highest = RegExpCompiler::kNoRegister;
+   DeferredAction* current = actions_;
+   while (current != nullptr) {
+     int candidate;
+     if (current->action_type() == ActionNode::CLEAR_CAPTURES) {
+       // A clear touches the whole interval of capture registers.
+       Interval cleared = static_cast<DeferredClearCaptures*>(current)->range();
+       for (int r = cleared.from(); r <= cleared.to(); r++) {
+         affected_registers->Set(r, zone);
+       }
+       candidate = cleared.to();
+     } else {
+       affected_registers->Set(current->reg(), zone);
+       candidate = current->reg();
+     }
+     if (candidate > highest) highest = candidate;
+     current = current->next();
+   }
+   return highest;
+ }
+
+ // Emits the undo code for a flushed trace. Walks the registers from
+ // `max_register` down to zero, popping those in `registers_to_pop` and
+ // clearing those in `registers_to_clear`. Adjacent registers to clear are
+ // merged into a single ClearRegisters call.
+ void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
+                                      int max_register,
+                                      const DynamicBitSet& registers_to_pop,
+                                      const DynamicBitSet& registers_to_clear) {
+   int reg = max_register;
+   while (reg >= 0) {
+     if (registers_to_pop.Get(reg)) {
+       assembler->PopRegister(reg);
+     } else if (registers_to_clear.Get(reg)) {
+       // Extend the run downwards while the next lower register also needs
+       // clearing.
+       const int run_end = reg;
+       int run_start = reg;
+       while (run_start > 0 && registers_to_clear.Get(run_start - 1)) {
+         --run_start;
+       }
+       assembler->ClearRegisters(run_start, run_end);
+       reg = run_start;
+     }
+     --reg;
+   }
+ }
+
+ // Emits code that applies the deferred actions of this trace to the real
+ // registers. For each register in `affected_registers` (0..max_register) the
+ // action list is scanned once to work out (a) the net effect to apply now
+ // and (b) how to undo it on backtrack. Registers that must be restored are
+ // pushed here and recorded in `registers_to_pop`; registers that only need
+ // clearing are recorded in `registers_to_clear`.
+ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
+ int max_register,
+ const DynamicBitSet& affected_registers,
+ DynamicBitSet* registers_to_pop,
+ DynamicBitSet* registers_to_clear,
+ Zone* zone) {
+ // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
+ const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
+
+ // Count pushes performed to force a stack limit check occasionally.
+ int pushes = 0;
+
+ for (int reg = 0; reg <= max_register; reg++) {
+ if (!affected_registers.Get(reg)) continue;
+
+ // The chronologically first deferred action in the trace
+ // is used to infer the action needed to restore a register
+ // to its previous state (or not, if it's safe to ignore it).
+ enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
+ DeferredActionUndoType undo_action = IGNORE;
+
+ // Accumulated net effect on this register: an increment/absolute value, a
+ // clear, or a position store (kNoStore means "no store seen").
+ int value = 0;
+ bool absolute = false;
+ bool clear = false;
+ static const int kNoStore = kMinInt;
+ int store_position = kNoStore;
+ // This is a little tricky because we are scanning the actions in reverse
+ // historical order (newest first).
+ for (DeferredAction* action = actions_; action != nullptr;
+ action = action->next()) {
+ if (action->Mentions(reg)) {
+ switch (action->action_type()) {
+ case ActionNode::SET_REGISTER_FOR_LOOP: {
+ Trace::DeferredSetRegisterForLoop* psr =
+ static_cast<Trace::DeferredSetRegisterForLoop*>(action);
+ if (!absolute) {
+ value += psr->value();
+ absolute = true;
+ }
+ // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
+ // counters. They can have a significant previous value if they
+ // occur in a loop. TODO(lrn): Propagate this information, so
+ // we can set undo_action to IGNORE if we know there is no value to
+ // restore.
+ undo_action = RESTORE;
+ DCHECK_EQ(store_position, kNoStore);
+ DCHECK(!clear);
+ break;
+ }
+ case ActionNode::INCREMENT_REGISTER:
+ if (!absolute) {
+ value++;
+ }
+ DCHECK_EQ(store_position, kNoStore);
+ DCHECK(!clear);
+ undo_action = RESTORE;
+ break;
+ case ActionNode::STORE_POSITION: {
+ Trace::DeferredCapture* pc =
+ static_cast<Trace::DeferredCapture*>(action);
+ if (!clear && store_position == kNoStore) {
+ store_position = pc->cp_offset();
+ }
+
+ // For captures we know that stores and clears alternate.
+ // Other registers are never cleared, and if they occur
+ // inside a loop, they might be assigned more than once.
+ if (reg <= 1) {
+ // Registers zero and one, aka "capture zero", is
+ // always set correctly if we succeed. There is no
+ // need to undo a setting on backtrack, because we
+ // will set it again or fail.
+ undo_action = IGNORE;
+ } else {
+ undo_action = pc->is_capture() ? CLEAR : RESTORE;
+ }
+ DCHECK(!absolute);
+ DCHECK_EQ(value, 0);
+ break;
+ }
+ case ActionNode::CLEAR_CAPTURES: {
+ // Since we're scanning in reverse order, if we've already
+ // set the position we have to ignore historically earlier
+ // clearing operations.
+ if (store_position == kNoStore) {
+ clear = true;
+ }
+ undo_action = RESTORE;
+ DCHECK(!absolute);
+ DCHECK_EQ(value, 0);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+ }
+ }
+ // Prepare for the undo-action (e.g., push if it's going to be popped).
+ if (undo_action == RESTORE) {
+ pushes++;
+ RegExpMacroAssembler::StackCheckFlag stack_check =
+ RegExpMacroAssembler::kNoStackLimitCheck;
+ // Every push_limit-th push requests a stack limit check; the counter
+ // then restarts.
+ if (pushes == push_limit) {
+ stack_check = RegExpMacroAssembler::kCheckStackLimit;
+ pushes = 0;
+ }
+
+ assembler->PushRegister(reg, stack_check);
+ registers_to_pop->Set(reg, zone);
+ } else if (undo_action == CLEAR) {
+ registers_to_clear->Set(reg, zone);
+ }
+ // Perform the chronologically last action (or accumulated increment)
+ // for the register.
+ if (store_position != kNoStore) {
+ assembler->WriteCurrentPositionToRegister(reg, store_position);
+ } else if (clear) {
+ assembler->ClearRegisters(reg, reg);
+ } else if (absolute) {
+ assembler->SetRegister(reg, value);
+ } else if (value != 0) {
+ assembler->AdvanceRegister(reg, value);
+ }
+ }
+ }
+
+ // This is called as we come into a loop choice node and some other tricky
+ // nodes. It normalizes the state of the code generator to ensure we can
+ // generate generic code.
+ //
+ // The deferred state of this (non-trivial) trace is emitted as real code,
+ // `successor` is then generated (or queued) against a trivial trace, and
+ // finally the code that undoes the state changes on backtrack is emitted.
+ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
+   RegExpMacroAssembler* masm = compiler->macro_assembler();
+
+   DCHECK(!is_trivial());
+
+   const bool has_backtrack = backtrack() != nullptr;
+   const bool has_cp_advance = cp_offset_ != 0;
+
+   if (actions_ == nullptr && !has_backtrack) {
+     // Only a deferred advance of the current position is pending (plus,
+     // possibly, quick-check knowledge that is simply dropped). Apply the
+     // advance and continue with a trivial trace.
+     if (has_cp_advance) masm->AdvanceCurrentPosition(cp_offset_);
+     Trace trivial_trace;
+     successor->Emit(compiler, &trivial_trace);
+     return;
+   }
+
+   // A concrete backtrack location is set up by a choice node and implies a
+   // deferred save of the current position, which is emitted here.
+   if (has_backtrack) {
+     masm->PushCurrentPosition();
+   }
+
+   // Apply the deferred register actions and remember how to undo them.
+   Zone* const zone = compiler->zone();
+   DynamicBitSet affected;
+   const int max_register = FindAffectedRegisters(&affected, zone);
+   DynamicBitSet to_pop;
+   DynamicBitSet to_clear;
+   PerformDeferredActions(masm, max_register, affected, &to_pop, &to_clear,
+                          zone);
+   if (has_cp_advance) {
+     masm->AdvanceCurrentPosition(cp_offset_);
+   }
+
+   // Continue with the successor under a trivial trace, either by emitting
+   // it right away or by queueing it and jumping to its label.
+   Label undo;
+   masm->PushBacktrack(&undo);
+   if (!successor->KeepRecursing(compiler)) {
+     compiler->AddWork(successor);
+     masm->GoTo(successor->label());
+   } else {
+     Trace trivial_trace;
+     successor->Emit(compiler, &trivial_trace);
+   }
+
+   // Backtrack path: restore the registers, then go to the saved location.
+   masm->BindJumpTarget(&undo);
+   RestoreAffectedRegisters(masm, max_register, to_pop, to_clear);
+   if (has_backtrack) {
+     masm->PopCurrentPosition();
+     masm->GoTo(backtrack());
+   } else {
+     masm->Backtrack();
+   }
+ }
+
+ // Emits the code run when the body of a negative submatch succeeds: restore
+ // the saved position and stack pointer, clear the body's captures, and
+ // backtrack.
+ void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
+   RegExpMacroAssembler* masm = compiler->macro_assembler();
+
+   // The trace is not flushed: the whole stack frame is discarded anyway.
+   // Since the trace is ignored, this code also serves as the generic
+   // version, so the label is bound the first time through.
+   const bool already_bound = label()->is_bound();
+   if (!already_bound) {
+     masm->Bind(label());
+   }
+
+   // Drop everything pushed on the backtrack stack since the negative
+   // submatch began, and restore the character position.
+   masm->ReadCurrentPositionFromRegister(current_position_register_);
+   masm->ReadStackPointerFromRegister(stack_pointer_register_);
+
+   // Clear captures that the successful body may have recorded.
+   if (clear_capture_count_ > 0) {
+     const int first = clear_capture_start_;
+     const int last = first + clear_capture_count_ - 1;
+     masm->ClearRegisters(first, last);
+   }
+
+   // With the stack unwound, its top entry is the backtrack location that
+   // the BeginNegativeSubmatch node got.
+   masm->Backtrack();
+ }
+
+ // Emits the terminal action of this end node. A non-trivial trace is flushed
+ // first; the flush re-enters this method with a trivial trace.
+ void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+   if (!trace->is_trivial()) {
+     trace->Flush(compiler, this);
+     return;
+   }
+
+   RegExpMacroAssembler* masm = compiler->macro_assembler();
+   if (!label()->is_bound()) {
+     masm->Bind(label());
+   }
+
+   if (action_ == ACCEPT) {
+     masm->Succeed();
+     return;
+   }
+   if (action_ == BACKTRACK) {
+     masm->GoTo(trace->backtrack());
+     return;
+   }
+   if (action_ == NEGATIVE_SUBMATCH_SUCCESS) {
+     // This case is handled in a different virtual method.
+     UNREACHABLE();
+   }
+   UNIMPLEMENTED();
+ }
+
+ // Adds `guard` to this alternative, allocating the guard list in `zone` on
+ // first use.
+ void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
+   const bool needs_list = (guards_ == nullptr);
+   if (needs_list) {
+     guards_ = zone->New<ZoneList<Guard*>>(1, zone);
+   }
+   guards_->Add(guard, zone);
+ }
+
+ // Creates a SET_REGISTER_FOR_LOOP action that records register `reg` and
+ // value `val`, followed by `on_success`.
+ ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
+                                            RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node =
+       zone->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
+   node->data_.u_store_register.reg = reg;
+   node->data_.u_store_register.value = val;
+   return node;
+ }
+
+ // Creates an INCREMENT_REGISTER action for register `reg`, followed by
+ // `on_success`.
+ ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node =
+       zone->New<ActionNode>(INCREMENT_REGISTER, on_success);
+   node->data_.u_increment_register.reg = reg;
+   return node;
+ }
+
+ // Creates a STORE_POSITION action for register `reg`, followed by
+ // `on_success`. `is_capture` is recorded alongside the register.
+ ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
+                                       RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node = zone->New<ActionNode>(STORE_POSITION, on_success);
+   node->data_.u_position_register.reg = reg;
+   node->data_.u_position_register.is_capture = is_capture;
+   return node;
+ }
+
+ // Creates a CLEAR_CAPTURES action covering the registers in `range`,
+ // followed by `on_success`.
+ ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node = zone->New<ActionNode>(CLEAR_CAPTURES, on_success);
+   node->data_.u_clear_captures.range_from = range.from();
+   node->data_.u_clear_captures.range_to = range.to();
+   return node;
+ }
+
+ // Creates a BEGIN_POSITIVE_SUBMATCH action that records the stack pointer
+ // register and the current position register, followed by `on_success`.
+ ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
+                                               RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node =
+       zone->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success);
+   node->data_.u_submatch.stack_pointer_register = stack_reg;
+   node->data_.u_submatch.current_position_register = position_reg;
+   return node;
+ }
+
+ // Creates a BEGIN_NEGATIVE_SUBMATCH action that records the stack pointer
+ // register and the current position register, followed by `on_success`.
+ ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
+                                               RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node =
+       zone->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
+   node->data_.u_submatch.stack_pointer_register = stack_reg;
+   node->data_.u_submatch.current_position_register = position_reg;
+   return node;
+ }
+
+ // Creates a POSITIVE_SUBMATCH_SUCCESS action, followed by `on_success`. It
+ // records the stack pointer and position registers together with the run of
+ // registers (start and count) to clear.
+ ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
+                                                 int clear_register_count,
+                                                 int clear_register_from,
+                                                 RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node =
+       zone->New<ActionNode>(POSITIVE_SUBMATCH_SUCCESS, on_success);
+   node->data_.u_submatch.stack_pointer_register = stack_reg;
+   node->data_.u_submatch.current_position_register = position_reg;
+   node->data_.u_submatch.clear_register_count = clear_register_count;
+   node->data_.u_submatch.clear_register_from = clear_register_from;
+   return node;
+ }
+
+ // Creates an EMPTY_MATCH_CHECK action, followed by `on_success`. It records
+ // the start register, the repetition register and the repetition limit.
+ ActionNode* ActionNode::EmptyMatchCheck(int start_register,
+                                         int repetition_register,
+                                         int repetition_limit,
+                                         RegExpNode* on_success) {
+   Zone* const zone = on_success->zone();
+   ActionNode* const node =
+       zone->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
+   node->data_.u_empty_match_check.start_register = start_register;
+   node->data_.u_empty_match_check.repetition_register = repetition_register;
+   node->data_.u_empty_match_check.repetition_limit = repetition_limit;
+   return node;
+ }
+
+ // Defines Accept() for every node type listed by FOR_EACH_NODE_TYPE: each
+ // <Type>Node::Accept calls the visitor's Visit<Type>() with this node.
+ #define DEFINE_ACCEPT(Type) \
+ void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
+ FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
+ #undef DEFINE_ACCEPT
+
+// -------------------------------------------------------------------
+// Emit code.
+
+ // Emits the test for one guard. The generated code jumps to the trace's
+ // backtrack location when the guard does not hold: a LT guard fails when
+ // the register is >= the guard value, a GEQ guard when it is < the value.
+ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
+                                Guard* guard, Trace* trace) {
+   switch (guard->op()) {
+     case Guard::LT: {
+       // The guarded register must have no pending deferred action.
+       DCHECK(!trace->mentions_reg(guard->reg()));
+       Label* const on_violation = trace->backtrack();
+       macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
+                                     on_violation);
+       break;
+     }
+     case Guard::GEQ: {
+       DCHECK(!trace->mentions_reg(guard->reg()));
+       Label* const on_violation = trace->backtrack();
+       macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
+                                     on_violation);
+       break;
+     }
+   }
+ }
+
+namespace {
+
+#ifdef DEBUG
+ // Debug helper: true if none of the first `length` entries of `chars`
+ // exceeds String::kMaxUtf16CodeUnit.
+ bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
+   static_assert(sizeof(unibrow::uchar) == 4);
+   int index = 0;
+   while (index < length) {
+     if (chars[index] > String::kMaxUtf16CodeUnit) {
+       return false;
+     }
+     ++index;
+   }
+   return true;
+ }
+#endif // DEBUG
+
+ // Returns the number of characters in the equivalence class, omitting those
+ // that cannot occur in the source string because it is Latin1.
+ //
+ // The case-equivalent code units of `character` are written to
+ // `letters[0 .. result)`. With V8_INTL_SUPPORT, `letter_length` is the
+ // capacity of `letters` and every write is checked against it. Without it,
+ // the count comes from the isolate's uncanonicalize table and
+ // `letter_length` is not consulted.
+ int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
+                               bool one_byte_subject, unibrow::uchar* letters,
+                               int letter_length) {
+ #ifdef V8_INTL_SUPPORT
+   if (RegExpCaseFolding::IgnoreSet().contains(character)) {
+     letters[0] = character;
+     DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
+     return 1;
+   }
+   bool in_special_add_set =
+       RegExpCaseFolding::SpecialAddSet().contains(character);
+
+   icu::UnicodeSet set;
+   set.add(character);
+   set = set.closeOver(USET_CASE_INSENSITIVE);
+
+   UChar32 canon = 0;
+   if (in_special_add_set) {
+     canon = RegExpCaseFolding::Canonicalize(character);
+   }
+
+   int32_t range_count = set.getRangeCount();
+   int items = 0;
+   for (int32_t i = 0; i < range_count; i++) {
+     UChar32 start = set.getRangeStart(i);
+     UChar32 end = set.getRangeEnd(i);
+     for (UChar32 cu = start; cu <= end; cu++) {
+       if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
+       if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
+         continue;
+       }
+       // Check capacity at the point of the write. A per-range check of
+       // `end - start + items <= letter_length` is off by one, because an
+       // inclusive range holds end - start + 1 entries, and would allow a
+       // write to letters[letter_length].
+       CHECK(items < letter_length);
+       letters[items++] = static_cast<unibrow::uchar>(cu);
+     }
+   }
+   DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
+   return items;
+ #else
+   int length =
+       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
+   // Unibrow returns 0 or 1 for characters where case independence is
+   // trivial.
+   if (length == 0) {
+     letters[0] = character;
+     length = 1;
+   }
+
+   if (one_byte_subject) {
+     // Compact the array in place, keeping only the one-byte characters.
+     int new_length = 0;
+     for (int i = 0; i < length; i++) {
+       if (letters[i] <= String::kMaxOneByteCharCode) {
+         letters[new_length++] = letters[i];
+       }
+     }
+     length = new_length;
+   }
+
+   DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
+   return length;
+ #endif  // V8_INTL_SUPPORT
+ }
+
+ // Emits a case-sensitive test for character `c`: loads the character at
+ // `cp_offset` unless it is already preloaded, then jumps to `on_failure` if
+ // it differs from `c`. Returns true exactly when the load was emitted here.
+ inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
+                                 base::uc16 c, Label* on_failure, int cp_offset,
+                                 bool check, bool preloaded) {
+   RegExpMacroAssembler* masm = compiler->macro_assembler();
+   const bool loaded_here = !preloaded;
+   if (loaded_here) {
+     masm->LoadCurrentCharacter(cp_offset, on_failure, check);
+   }
+   masm->CheckNotCharacter(c, on_failure);
+   return loaded_here;
+ }
+
+ // Only emits non-letters (things that don't have case). Only used for case
+ // independent matches.
+ //
+ // Returns true only when a bounds-checked load was emitted here.
+ inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
+                               base::uc16 c, Label* on_failure, int cp_offset,
+                               bool check, bool preloaded) {
+   RegExpMacroAssembler* masm = compiler->macro_assembler();
+   const bool one_byte = compiler->one_byte();
+   unibrow::uchar chars[4];
+   const int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
+
+   // length < 1: this can't match (one-byte subject, non-one-byte character)
+   // and the one-byte pass already handled it. length > 1: a letter, which
+   // is handled in a later pass. Nothing is emitted in either case, so the
+   // bounds are not checked.
+   if (length != 1) {
+     return false;
+   }
+   if (one_byte && c > String::kMaxOneByteCharCodeU) {
+     // Can't match - see above. Bounds not checked.
+     return false;
+   }
+
+   bool checked = false;
+   if (!preloaded) {
+     masm->LoadCurrentCharacter(cp_offset, on_failure, check);
+     checked = check;
+   }
+   masm->CheckNotCharacter(c, on_failure);
+   return checked;
+ }
+
+ // Tries to test "current character is c1 or c2" with one masked compare.
+ // This works when the two characters differ in exactly one bit, or differ
+ // by a power of two with c1 >= that difference. Returns true if such a test
+ // was emitted and false if the caller must emit separate comparisons.
+ bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
+                                bool one_byte, base::uc16 c1, base::uc16 c2,
+                                Label* on_failure) {
+   const uint32_t char_mask = CharMask(one_byte);
+   // Ecma262UnCanonicalize always gives the highest number last.
+   DCHECK(c2 > c1);
+
+   // Case 1: c1 and c2 differ in a single bit (their xor has one bit set).
+   base::uc16 exor = c1 ^ c2;
+   const bool single_bit_apart = ((exor - 1) & exor) == 0;
+   if (single_bit_apart) {
+     base::uc16 mask = char_mask ^ exor;
+     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
+     return true;
+   }
+
+   // Case 2: the characters differ by 2^n but not by one bit. Subtract the
+   // difference from the found character, then do the mask trick. The
+   // theoretical case involving negative numbers is avoided to keep code
+   // generation simple.
+   base::uc16 diff = c2 - c1;
+   const bool power_of_two_apart = ((diff - 1) & diff) == 0;
+   if (power_of_two_apart && c1 >= diff) {
+     base::uc16 mask = char_mask ^ diff;
+     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
+                                                     on_failure);
+     return true;
+   }
+   return false;
+ }
+
+ // Only emits letters (things that have case). Only used for case independent
+ // matches.
+ //
+ // Returns false without emitting anything when `c` has at most one case
+ // variant, and true once the test has been emitted.
+ inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
+                            base::uc16 c, Label* on_failure, int cp_offset,
+                            bool check, bool preloaded) {
+   RegExpMacroAssembler* masm = compiler->macro_assembler();
+   const bool one_byte = compiler->one_byte();
+   unibrow::uchar chars[4];
+   const int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
+   if (length <= 1) return false;
+
+   // We may not need to check against the end of the input string
+   // if this character lies before a character that matched.
+   if (!preloaded) {
+     masm->LoadCurrentCharacter(cp_offset, on_failure, check);
+   }
+
+   Label ok;
+   if (length == 2) {
+     const bool emitted_shortcut = ShortCutEmitCharacterPair(
+         masm, one_byte, chars[0], chars[1], on_failure);
+     if (!emitted_shortcut) {
+       masm->CheckCharacter(chars[0], &ok);
+       masm->CheckNotCharacter(chars[1], on_failure);
+       masm->Bind(&ok);
+     }
+   } else if (length == 3 || length == 4) {
+     // With four variants the fourth is tested first; the remaining three
+     // tests are shared with the three-variant case.
+     if (length == 4) {
+       masm->CheckCharacter(chars[3], &ok);
+     }
+     masm->CheckCharacter(chars[0], &ok);
+     masm->CheckCharacter(chars[1], &ok);
+     masm->CheckNotCharacter(chars[2], on_failure);
+     masm->Bind(&ok);
+   } else {
+     UNREACHABLE();
+   }
+   return true;
+ }
+
+// Emits a single comparison against border: characters below border go to
+// `below`, all others to `above_or_equal`. No jump is emitted for whichever
+// label equals fall_through.
+void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
+                      Label* fall_through, Label* above_or_equal,
+                      Label* below) {
+  if (below == fall_through) {
+    // Only the upper side needs a jump.
+    masm->CheckCharacterGT(border - 1, above_or_equal);
+    return;
+  }
+  masm->CheckCharacterLT(border, below);
+  if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
+}
+
+// Emits a test of the current character against the inclusive interval
+// [first, last]: matches go to in_range, everything else to out_of_range. No
+// jump is emitted for whichever label equals fall_through.
+void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
+                            Label* fall_through, Label* in_range,
+                            Label* out_of_range) {
+  const bool single_character = (first == last);
+
+  if (in_range == fall_through) {
+    // Only a miss needs a jump.
+    if (single_character) {
+      masm->CheckNotCharacter(first, out_of_range);
+    } else {
+      masm->CheckCharacterNotInRange(first, last, out_of_range);
+    }
+    return;
+  }
+
+  if (single_character) {
+    masm->CheckCharacter(first, in_range);
+  } else {
+    masm->CheckCharacterInRange(first, last, in_range);
+  }
+  if (out_of_range != fall_through) masm->GoTo(out_of_range);
+}
+
+// Emits a table-based test for a run of range boundaries that all lie on one
+// kTableSize page: fills a kTableSize-entry table with one flag per character
+// slot, copies it into a ByteArray and emits a CheckBitInTable against it.
+// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
+// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
+void EmitUseLookupTable(RegExpMacroAssembler* masm,
+ ZoneList<base::uc32>* ranges, uint32_t start_index,
+ uint32_t end_index, base::uc32 min_char,
+ Label* fall_through, Label* even_label,
+ Label* odd_label) {
+ static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
+ static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
+
+ // Start of the page containing min_char; only used by the DCHECKs below.
+ base::uc32 base = (min_char & ~kMask);
+ USE(base);
+
+ // Assert that everything is on one kTableSize page.
+ for (uint32_t i = start_index; i <= end_index; i++) {
+ DCHECK_EQ(ranges->at(i) & ~kMask, base);
+ }
+ DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
+
+ // Scratch table; entry j holds the flag for character slot j of the page.
+ char templ[kSize];
+ Label* on_bit_set;
+ Label* on_bit_clear;
+ int bit;
+ // Choose which label is taken when the table bit is set. If even_label is
+ // the fall-through label it becomes the bit-clear target.
+ if (even_label == fall_through) {
+ on_bit_set = odd_label;
+ on_bit_clear = even_label;
+ bit = 1;
+ } else {
+ on_bit_set = even_label;
+ on_bit_clear = odd_label;
+ bit = 0;
+ }
+ // Slots below the first boundary get the initial flag value.
+ for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
+ i++) {
+ templ[i] = bit;
+ }
+ uint32_t j = 0;
+ bit ^= 1;
+ // Fill the slots between each pair of consecutive boundaries, flipping the
+ // flag value after every interval.
+ for (uint32_t i = start_index; i < end_index; i++) {
+ for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
+ templ[j] = bit;
+ }
+ bit ^= 1;
+ }
+ // Fill from j (where the loop above stopped) to the end of the table.
+ for (uint32_t i = j; i < kSize; i++) {
+ templ[i] = bit;
+ }
+ Factory* factory = masm->isolate()->factory();
+ // TODO(erikcorry): Cache these.
+ Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
+ for (uint32_t i = 0; i < kSize; i++) {
+ ba->set(i, templ[i]);
+ }
+ masm->CheckBitInTable(ba, on_bit_set);
+ // The bit-clear case only needs a jump if it is not the fall-through.
+ if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
+}
+
+// Emits a direct test for the interval that starts at ranges[cut_index] and
+// then removes that interval from the boundary list by rewriting it in
+// place.
+void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
+                 uint32_t start_index, uint32_t end_index, uint32_t cut_index,
+                 Label* even_label, Label* odd_label) {
+  const bool cut_is_odd = ((cut_index - start_index) & 1) == 1;
+  Label* const hit_label = cut_is_odd ? odd_label : even_label;
+
+  // Neither outcome may reach the placeholder label: it serves both as the
+  // fall-through and as the out-of-range target.
+  Label placeholder;
+  EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
+                         ranges->at(cut_index + 1) - 1, &placeholder,
+                         hit_label, &placeholder);
+  DCHECK(!placeholder.is_linked());
+
+  // Cut out the single range by rewriting the array. This creates a new
+  // range that is a merger of the two ranges on either side of the one we
+  // are cutting out. The oddity of the labels is preserved.
+  uint32_t slot = cut_index;
+  while (slot > start_index) {
+    ranges->at(slot) = ranges->at(slot - 1);
+    --slot;
+  }
+  for (slot = cut_index + 1; slot < end_index; ++slot) {
+    ranges->at(slot) = ranges->at(slot + 1);
+  }
+}
+
+// Unicode case. Split the search space into kSize spaces that are handled
+// with recursion.
+// Reads the boundaries ranges[start_index..end_index] and writes three
+// results: *border (the character value at which the space is split),
+// *new_end_index (last boundary index of the lower part) and
+// *new_start_index (first boundary index of the upper part).
+void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
+ uint32_t end_index, uint32_t* new_start_index,
+ uint32_t* new_end_index, base::uc32* border) {
+ static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
+ static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
+
+ base::uc32 first = ranges->at(start_index);
+ base::uc32 last = ranges->at(end_index) - 1;
+
+ // Initial guess: split at the end of the kSize page that contains the
+ // first boundary.
+ *new_start_index = start_index;
+ *border = (ranges->at(start_index) & ~kMask) + kSize;
+ while (*new_start_index < end_index) {
+ if (ranges->at(*new_start_index) > *border) break;
+ (*new_start_index)++;
+ }
+ // new_start_index is the index of the first edge that is beyond the
+ // current kSize space.
+
+ // For very large search spaces we do a binary chop search of the non-Latin1
+ // space instead of just going to the end of the current kSize space. The
+ // heuristics are complicated a little by the fact that any 128-character
+ // encoding space can be quickly tested with a table lookup, so we don't
+ // wish to do binary chop search at a smaller granularity than that. A
+ // 128-character space can take up a lot of space in the ranges array if,
+ // for example, we only want to match every second character (eg. the lower
+ // case characters on some Unicode pages).
+ uint32_t binary_chop_index = (end_index + start_index) / 2;
+ // The first test ensures that we get to the code that handles the Latin1
+ // range with a single not-taken branch, speeding up this important
+ // character range (even non-Latin1 charset-based text has spaces and
+ // punctuation).
+ if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
+ end_index - start_index > (*new_start_index - start_index) * 2 &&
+ last - first > kSize * 2 && binary_chop_index > *new_start_index &&
+ ranges->at(binary_chop_index) >= first + 2 * kSize) {
+ uint32_t scan_forward_for_section_border = binary_chop_index;
+ // Candidate border: end of the page containing the middle boundary.
+ uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
+
+ // Adopt the candidate only if some boundary lies strictly beyond it.
+ while (scan_forward_for_section_border < end_index) {
+ if (ranges->at(scan_forward_for_section_border) > new_border) {
+ *new_start_index = scan_forward_for_section_border;
+ *border = new_border;
+ break;
+ }
+ scan_forward_for_section_border++;
+ }
+ }
+
+ DCHECK(*new_start_index > start_index);
+ // The lower part ends just before new_start_index; a boundary that sits
+ // exactly on the border is dropped from the lower part.
+ *new_end_index = *new_start_index - 1;
+ if (ranges->at(*new_end_index) == *border) {
+ (*new_end_index)--;
+ }
+ // The border is at or past the last boundary: clamp it to that boundary so
+ // the lower part covers everything up to end_index - 1.
+ if (*border >= ranges->at(end_index)) {
+ *border = ranges->at(end_index);
+ *new_start_index = end_index; // Won't be used.
+ *new_end_index = end_index - 1;
+ }
+}
+
+// Gets a series of segment boundaries representing a character class. If the
+// character is in the range between an even and an odd boundary (counting from
+// start_index) then go to even_label, otherwise go to odd_label. We already
+// know that the character is in the range of min_char to max_char inclusive.
+// Either label can be nullptr indicating backtracking. Either label can also
+// be equal to the fall_through label.
+//
+// The function picks one of several strategies depending on the number of
+// boundaries: a single comparison, a single interval test, cutting out one
+// interval at a time, a table lookup, or splitting the search space and
+// recursing on both halves. Note that `ranges` is rewritten in place when
+// intervals are cut out.
+void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
+                      uint32_t start_index, uint32_t end_index,
+                      base::uc32 min_char, base::uc32 max_char,
+                      Label* fall_through, Label* even_label,
+                      Label* odd_label) {
+  DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
+  DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
+
+  base::uc32 first = ranges->at(start_index);
+  base::uc32 last = ranges->at(end_index) - 1;
+
+  DCHECK_LT(min_char, first);
+
+  // Just need to test if the character is before or on-or-after
+  // a particular character.
+  if (start_index == end_index) {
+    EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
+    return;
+  }
+
+  // Another almost trivial case: There is one interval in the middle that is
+  // different from the end intervals.
+  if (start_index + 1 == end_index) {
+    EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
+                           odd_label);
+    return;
+  }
+
+  // It's not worth using table lookup if there are very few intervals in the
+  // character class.
+  if (end_index - start_index <= 6) {
+    // It is faster to test for individual characters, so we look for those
+    // first, then try arbitrary ranges in the second round.
+    // Sentinel meaning "no single-character interval found"; a true
+    // compile-time constant rather than a mutable function-local static.
+    static constexpr uint32_t kNoCutIndex = static_cast<uint32_t>(-1);
+    uint32_t cut = kNoCutIndex;
+    for (uint32_t i = start_index; i < end_index; i++) {
+      if (ranges->at(i) == ranges->at(i + 1) - 1) {
+        cut = i;
+        break;
+      }
+    }
+    if (cut == kNoCutIndex) cut = start_index;
+    CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
+                odd_label);
+    DCHECK_GE(end_index - start_index, 2);
+    // CutOutRange removed two boundaries, so recurse on the narrower window.
+    GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
+                     max_char, fall_through, even_label, odd_label);
+    return;
+  }
+
+  // If there are a lot of intervals in the regexp, then we will use tables to
+  // determine whether the character is inside or outside the character class.
+  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
+
+  if ((max_char >> kBits) == (min_char >> kBits)) {
+    EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
+                       fall_through, even_label, odd_label);
+    return;
+  }
+
+  if ((min_char >> kBits) != first >> kBits) {
+    // The first boundary is on a later table page than min_char: peel off
+    // everything below it with one comparison. Dropping a boundary swaps the
+    // roles of the even and odd labels for the recursive call.
+    masm->CheckCharacterLT(first, odd_label);
+    GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
+                     fall_through, odd_label, even_label);
+    return;
+  }
+
+  uint32_t new_start_index = 0;
+  uint32_t new_end_index = 0;
+  base::uc32 border = 0;
+
+  SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
+                   &new_end_index, &border);
+
+  Label handle_rest;
+  Label* above = &handle_rest;
+  if (border == last + 1) {
+    // We didn't find any section that started after the limit, so everything
+    // above the border is one of the terminal labels.
+    above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
+    DCHECK(new_end_index == end_index - 1);
+  }
+
+  DCHECK_LE(start_index, new_end_index);
+  DCHECK_LE(new_start_index, end_index);
+  DCHECK_LT(start_index, new_start_index);
+  DCHECK_LT(new_end_index, end_index);
+  DCHECK(new_end_index + 1 == new_start_index ||
+         (new_end_index + 2 == new_start_index &&
+          border == ranges->at(new_end_index + 1)));
+  DCHECK_LT(min_char, border - 1);
+  DCHECK_LT(border, max_char);
+  DCHECK_LT(ranges->at(new_end_index), border);
+  DCHECK(border < ranges->at(new_start_index) ||
+         (border == ranges->at(new_start_index) &&
+          new_start_index == end_index && new_end_index == end_index - 1 &&
+          border == last + 1));
+  DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
+
+  // Characters at or above the border go to `above`; the lower part is
+  // generated inline right after the comparison.
+  masm->CheckCharacterGT(border - 1, above);
+  Label dummy;
+  GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
+                   border - 1, &dummy, even_label, odd_label);
+  if (handle_rest.is_linked()) {
+    masm->Bind(&handle_rest);
+    // If the upper part starts at a different parity than the whole list,
+    // the even and odd labels have to be swapped for it.
+    bool flip = (new_start_index & 1) != (start_index & 1);
+    GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
+                     &dummy, flip ? odd_label : even_label,
+                     flip ? even_label : odd_label);
+  }
+}
+
+// Emits the match code for the character class cr at cp_offset. Trivial
+// classes (empty, or covering every code unit) are handled up front; then the
+// assembler's special-class and range-array helpers are tried; otherwise the
+// class is flattened into a boundary list and handed to GenerateBranches,
+// with on_failure as the label for a non-matching character.
+void EmitClassRanges(RegExpMacroAssembler* macro_assembler,
+ RegExpClassRanges* cr, bool one_byte, Label* on_failure,
+ int cp_offset, bool check_offset, bool preloaded,
+ Zone* zone) {
+ ZoneList<CharacterRange>* ranges = cr->ranges(zone);
+ CharacterRange::Canonicalize(ranges);
+
+ // Now that all processing (like case-insensitivity) is done, clamp the
+ // ranges to the set of ranges that may actually occur in the subject string.
+ if (one_byte) CharacterRange::ClampToOneByte(ranges);
+
+ const int ranges_length = ranges->length();
+ // No ranges at all: no character needs to be loaded or compared.
+ if (ranges_length == 0) {
+ if (!cr->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ }
+ if (check_offset) {
+ macro_assembler->CheckPosition(cp_offset, on_failure);
+ }
+ return;
+ }
+
+ const base::uc32 max_char = MaxCodeUnit(one_byte);
+ // A single range covering everything up to max_char: again no comparison
+ // of the character itself is needed.
+ if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
+ if (cr->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ } else {
+ // This is a common case hit by non-anchored expressions.
+ if (check_offset) {
+ macro_assembler->CheckPosition(cp_offset, on_failure);
+ }
+ }
+ return;
+ }
+
+ if (!preloaded) {
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
+ }
+
+ // Let the assembler handle standard classes itself if it reports that it
+ // can.
+ if (cr->is_standard(zone) && macro_assembler->CheckSpecialClassRanges(
+ cr->standard_type(), on_failure)) {
+ return;
+ }
+
+ static constexpr int kMaxRangesForInlineBranchGeneration = 16;
+ if (ranges_length > kMaxRangesForInlineBranchGeneration) {
+ // For large range sets, emit a more compact instruction sequence to avoid
+ // a potentially problematic increase in code size.
+ // Note the flipped logic below (we check InRange if negated, NotInRange if
+ // not negated); this is necessary since the method falls through on
+ // failure whereas we want to fall through on success.
+ if (cr->is_negated()) {
+ if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
+ return;
+ }
+ } else {
+ if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
+ return;
+ }
+ }
+ // If the helper returned false, fall back to inline branch generation.
+ }
+
+ // Generate a flat list of range boundaries for consumption by
+ // GenerateBranches. See the comment on that function for how the list should
+ // be structured
+ ZoneList<base::uc32>* range_boundaries =
+ zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
+
+ // Whether characters below the first recorded boundary fail the match.
+ bool zeroth_entry_is_failure = !cr->is_negated();
+
+ for (int i = 0; i < ranges_length; i++) {
+ CharacterRange& range = ranges->at(i);
+ if (range.from() == 0) {
+ // A range starting at 0 contributes no lower boundary; instead flip
+ // the meaning of the region below the first recorded boundary.
+ DCHECK_EQ(i, 0);
+ zeroth_entry_is_failure = !zeroth_entry_is_failure;
+ } else {
+ range_boundaries->Add(range.from(), zone);
+ }
+ // `+ 1` to convert from inclusive to exclusive `to`.
+ // [from, to] == [from, to+1[.
+ range_boundaries->Add(range.to() + 1, zone);
+ }
+ int end_index = range_boundaries->length() - 1;
+ // Ignore a final boundary that lies above max_char.
+ if (range_boundaries->at(end_index) > max_char) {
+ end_index--;
+ }
+
+ Label fall_through;
+ GenerateBranches(macro_assembler, range_boundaries,
+ 0, // start_index.
+ end_index,
+ 0, // min_char.
+ max_char, &fall_through,
+ zeroth_entry_is_failure ? &fall_through : on_failure,
+ zeroth_entry_is_failure ? on_failure : &fall_through);
+ macro_assembler->Bind(&fall_through);
+}
+
+} // namespace
+
+// Out-of-line definition of the defaulted RegExpNode destructor.
+RegExpNode::~RegExpNode() = default;
+
+// Decides whether code for this node should be generated inline for the
+// given trace (CONTINUE) or whether the request has already been satisfied
+// by jumping to, or flushing towards, a generic version (DONE).
+RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
+                                                  Trace* trace) {
+  // If we are generating a greedy loop then don't stop and don't reuse code.
+  if (trace->stop_node() != nullptr) return CONTINUE;
+
+  RegExpMacroAssembler* masm = compiler->macro_assembler();
+  if (trace->is_trivial()) {
+    const bool use_existing_generic =
+        label_.is_bound() || on_work_list() || !KeepRecursing(compiler);
+    if (!use_existing_generic) {
+      // Generate generic version of the node and bind the label for later
+      // use.
+      masm->Bind(&label_);
+      return CONTINUE;
+    }
+    // A generic version is already scheduled to be generated or we have
+    // recursed too deeply: just generate a jump to that code.
+    masm->GoTo(&label_);
+    // This will queue it up for generation of a generic version if it hasn't
+    // already been queued.
+    compiler->AddWork(this);
+    return DONE;
+  }
+
+  // We are being asked to make a non-generic version. Keep track of how many
+  // non-generic versions we generate so as not to overdo it.
+  trace_count_++;
+  const bool may_specialize = KeepRecursing(compiler) &&
+                              compiler->optimize() &&
+                              trace_count_ < kMaxCopiesCodeGenerated;
+  if (may_specialize) return CONTINUE;
+
+  // Code has been generated for this node too many times or recursion is too
+  // deep. Time to switch to a generic version. The code for generic versions
+  // above can handle deep recursion properly.
+  const bool previously_limiting = compiler->limiting_recursion();
+  compiler->set_limiting_recursion(true);
+  trace->Flush(compiler, this);
+  compiler->set_limiting_recursion(previously_limiting);
+  return DONE;
+}
+
+// Recursion may continue only while the compiler is not limiting recursion
+// and the current depth does not exceed RegExpCompiler::kMaxRecursion.
+bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
+  if (compiler->limiting_recursion()) return false;
+  return compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
+}
+
+// Fills in Boyer-Moore lookahead information for this action node and then
+// saves it via SaveBMInfo.
+void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+                              BoyerMooreLookahead* bm, bool not_at_start) {
+  const bool is_positive_submatch_success =
+      action_type_ == POSITIVE_SUBMATCH_SUCCESS;
+  if (!is_positive_submatch_success) {
+    on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+  } else {
+    // Anything may follow a positive submatch success, thus we need to accept
+    // all characters from this position onwards.
+    bm->SetRest(offset);
+  }
+  SaveBMInfo(bm, not_at_start, offset);
+}
+
+// Forwards the quick-check query to the successor node. A
+// SET_REGISTER_FOR_LOOP action uses the successor's loop-entry variant.
+void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                      RegExpCompiler* compiler, int filled_in,
+                                      bool not_at_start) {
+  RegExpNode* const successor = on_success();
+  if (action_type_ == SET_REGISTER_FOR_LOOP) {
+    successor->GetQuickCheckDetailsFromLoopEntry(details, compiler, filled_in,
+                                                 not_at_start);
+    return;
+  }
+  successor->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
+}
+
+// Fills in Boyer-Moore lookahead information by delegating to the successor,
+// except for an AT_START assertion when we know we are not at the start.
+void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+                                 BoyerMooreLookahead* bm, bool not_at_start) {
+  // Match the behaviour of EatsAtLeast on this node.
+  const bool start_assertion_past_start =
+      assertion_type() == AT_START && not_at_start;
+  if (start_assertion_past_start) return;
+
+  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+  SaveBMInfo(bm, not_at_start, offset);
+}
+
+// The quick-check details of a negative lookaround choice are those of its
+// continue node.
+void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
+    QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
+    bool not_at_start) {
+  continue_node()->GetQuickCheckDetails(details, compiler, filled_in,
+                                        not_at_start);
+}
+
+namespace {
+
+// Propagates the most significant set bit of v into every lower bit
+// position, e.g. 0b00010000 becomes 0b00011111.
+inline uint32_t SmearBitsRight(uint32_t v) {
+  // Doubling the shift each round (1, 2, 4, 8, 16) covers all 32 bits.
+  for (int shift = 1; shift <= 16; shift *= 2) {
+    v |= v >> shift;
+  }
+  return v;
+}
+
+} // namespace
+
+// Packs the per-position masks and values into the combined mask_ and value_
+// words, one character slot (8 or 16 bits) per position. Returns true if at
+// least one position has a mask bit within String::kMaxOneByteCharCode.
+bool QuickCheckDetails::Rationalize(bool asc) {
+  const uint32_t char_mask = CharMask(asc);
+  const int bits_per_char = asc ? 8 : 16;
+  bool found_useful_op = false;
+  mask_ = 0;
+  value_ = 0;
+  for (int i = 0, shift = 0; i < characters_; i++, shift += bits_per_char) {
+    const Position& pos = positions_[i];
+    if ((pos.mask & String::kMaxOneByteCharCode) != 0) found_useful_op = true;
+    mask_ |= (pos.mask & char_mask) << shift;
+    value_ |= (pos.value & char_mask) << shift;
+  }
+  return found_useful_op;
+}
+
+// Returns the stored eats-at-least value matching the caller's knowledge of
+// whether the current position can be the start of the input.
+int RegExpNode::EatsAtLeast(bool not_at_start) {
+  if (not_at_start) return eats_at_least_.eats_at_least_from_not_start;
+  return eats_at_least_.eats_at_least_from_possibly_start;
+}
+
+// Base-class version; reaching it is treated as a bug (UNREACHABLE).
+// LoopChoiceNode provides the real implementation.
+EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
+ // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
+ // implies that the following node must be a LoopChoiceNode. If we need to
+ // set registers to constant values for other reasons, we could introduce a
+ // new action type SET_REGISTER that doesn't imply anything about its
+ // successor.
+ UNREACHABLE();
+}
+
+// Base-class version; reaching it is treated as a bug (UNREACHABLE). All
+// parameters are unused here.
+void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
+ RegExpCompiler* compiler,
+ int characters_filled_in,
+ bool not_at_start) {
+ // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
+ UNREACHABLE();
+}
+
+// Computes how many characters a match entering this loop from outside is
+// guaranteed to consume: the loop body once per minimum iteration plus the
+// continuation. All intermediate results are saturated to uint8_t.
+EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
+  DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
+
+  if (read_backward()) {
+    // The eats_at_least value is not used if reading backward. The
+    // EatsAtLeastPropagator should've zeroed it as well.
+    DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
+    DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
+    return {};
+  }
+
+  const int continuation_from_not_start = continue_node_->EatsAtLeast(true);
+
+  // Figure out how much the loop body itself eats, not including anything in
+  // the continuation case. In general, the nodes in the loop body should
+  // report that they eat at least the number eaten by the continuation node,
+  // since any successful match in the loop body must also include the
+  // continuation node. However, in some cases involving positive lookaround,
+  // the loop body under-reports its appetite, so use saturated math here to
+  // avoid negative numbers.
+  const uint8_t body_from_not_start = base::saturated_cast<uint8_t>(
+      loop_node_->EatsAtLeast(true) - continuation_from_not_start);
+  const uint8_t body_from_possibly_start = base::saturated_cast<uint8_t>(
+      loop_node_->EatsAtLeast(false) - continuation_from_not_start);
+
+  // Limit the number of loop iterations to avoid overflow in subsequent
+  // steps.
+  const int iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
+
+  EatsAtLeastInfo result;
+  result.eats_at_least_from_not_start = base::saturated_cast<uint8_t>(
+      iterations * body_from_not_start + continuation_from_not_start);
+
+  const bool first_iteration_eats =
+      iterations > 0 && body_from_possibly_start > 0;
+  if (!first_iteration_eats) {
+    // Loop body might eat nothing, so only continue node contributes.
+    result.eats_at_least_from_possibly_start =
+        continue_node_->EatsAtLeast(false);
+    return result;
+  }
+
+  // First loop iteration eats at least one, so all subsequent iterations
+  // and the after-loop chunk are guaranteed to not be at the start.
+  result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
+      body_from_possibly_start + (iterations - 1) * body_from_not_start +
+      continuation_from_not_start);
+  return result;
+}
+
+// Emits a mask-and-compare "quick check" for this node using the preloaded
+// characters described by `details`.
+//
+// Returns false without emitting anything when no characters are requested,
+// when the details show the node cannot match, or when Rationalize() reports
+// that no useful mask was found. Otherwise the check is emitted and true is
+// returned: with fall_through_on_failure set, a possible match jumps to
+// on_possible_success; otherwise a definite mismatch jumps to the trace's
+// backtrack label.
+bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
+                                Trace* bounds_check_trace, Trace* trace,
+                                bool preload_has_checked_bounds,
+                                Label* on_possible_success,
+                                QuickCheckDetails* details,
+                                bool fall_through_on_failure,
+                                ChoiceNode* predecessor) {
+  DCHECK_NOT_NULL(predecessor);
+  if (details->characters() == 0) return false;
+  GetQuickCheckDetails(details, compiler, 0,
+                       trace->at_start() == Trace::FALSE_VALUE);
+  if (details->cannot_match()) return false;
+  if (!details->Rationalize(compiler->one_byte())) return false;
+  DCHECK(details->characters() == 1 ||
+         compiler->macro_assembler()->CanReadUnaligned());
+  uint32_t mask = details->mask();
+  uint32_t value = details->value();
+
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+
+  if (trace->characters_preloaded() != details->characters()) {
+    DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
+    // The bounds check is performed using the minimum number of characters
+    // any choice would eat, so if the bounds check fails, then none of the
+    // choices can succeed, so we can just immediately backtrack, rather
+    // than go to the next choice. The number of characters preloaded may be
+    // less than the number used for the bounds check.
+    int eats_at_least = predecessor->EatsAtLeast(
+        bounds_check_trace->at_start() == Trace::FALSE_VALUE);
+    DCHECK_GE(eats_at_least, details->characters());
+    assembler->LoadCurrentCharacter(
+        trace->cp_offset(), bounds_check_trace->backtrack(),
+        !preload_has_checked_bounds, details->characters(), eats_at_least);
+  }
+
+  // The explicit AND can be skipped when the load itself already discards
+  // every bit the mask would clear.
+  bool need_mask = true;
+
+  if (details->characters() == 1) {
+    // If number of characters preloaded is 1 then we used a byte or 16 bit
+    // load so the value is already masked down.
+    const uint32_t char_mask = CharMask(compiler->one_byte());
+    if ((mask & char_mask) == char_mask) need_mask = false;
+    mask &= char_mask;
+  } else {
+    // For 2-character preloads in one-byte mode we also use a 16 bit load
+    // with zero extend. (Single-character preloads, including those in
+    // two-byte mode, are fully handled by the branch above, so they need no
+    // case here.)
+    static const uint32_t kTwoByteMask = 0xFFFF;
+    static const uint32_t kFourByteMask = 0xFFFFFFFF;
+    if (details->characters() == 2 && compiler->one_byte()) {
+      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
+    } else {
+      if (mask == kFourByteMask) need_mask = false;
+    }
+  }
+
+  if (fall_through_on_failure) {
+    if (need_mask) {
+      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
+    } else {
+      assembler->CheckCharacter(value, on_possible_success);
+    }
+  } else {
+    if (need_mask) {
+      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
+    } else {
+      assembler->CheckNotCharacter(value, trace->backtrack());
+    }
+  }
+  return true;
+}
+
+// Here is the meat of GetQuickCheckDetails (see also the comment on the
+// super-class in the .h file).
+//
+// We iterate along the text object, building up for each character a
+// mask and value that can be used to test for a quick failure to match.
+// The masks and values for the positions will be combined into a single
+// machine word for the current character width in order to be used in
+// generating a quick check.
+//
+// characters_filled_in is the number of positions in `details` that are
+// already filled; this node continues from that index and stops as soon as
+// details->characters() positions are filled or a position is found that
+// cannot match at all.
+void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler,
+ int characters_filled_in,
+ bool not_at_start) {
+ // Do not collect any quick check details if the text node reads backward,
+ // since it reads in the opposite direction than we use for quick checks.
+ if (read_backward()) return;
+ Isolate* isolate = compiler->macro_assembler()->isolate();
+ DCHECK(characters_filled_in < details->characters());
+ int characters = details->characters();
+ const uint32_t char_mask = CharMask(compiler->one_byte());
+ for (int k = 0; k < elements()->length(); k++) {
+ TextElement elm = elements()->at(k);
+ if (elm.text_type() == TextElement::ATOM) {
+ // Literal text: one position per character of the atom.
+ base::Vector<const base::uc16> quarks = elm.atom()->data();
+ for (int i = 0; i < characters && i < quarks.length(); i++) {
+ QuickCheckDetails::Position* pos =
+ details->positions(characters_filled_in);
+ base::uc16 c = quarks[i];
+ if (IsIgnoreCase(compiler->flags())) {
+ unibrow::uchar chars[4];
+ int length = GetCaseIndependentLetters(
+ isolate, c, compiler->one_byte(), chars, 4);
+ if (length == 0) {
+ // This can happen because all case variants are non-Latin1, but we
+ // know the input is Latin1.
+ details->set_cannot_match();
+ pos->determines_perfectly = false;
+ return;
+ }
+ if (length == 1) {
+ // This letter has no case equivalents, so it's nice and simple
+ // and the mask-compare will determine definitely whether we have
+ // a match at this character position.
+ pos->mask = char_mask;
+ pos->value = chars[0];
+ pos->determines_perfectly = true;
+ } else {
+ // Fold all case variants into one mask/value pair: common_bits
+ // keeps the bit positions on which every variant agrees, bits
+ // holds the agreed value of those positions.
+ uint32_t common_bits = char_mask;
+ uint32_t bits = chars[0];
+ for (int j = 1; j < length; j++) {
+ uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
+ common_bits ^= differing_bits;
+ bits &= common_bits;
+ }
+ // If length is 2 and common bits has only one zero in it then
+ // our mask and compare instruction will determine definitely
+ // whether we have a match at this character position. Otherwise
+ // it can only be an approximate check.
+ uint32_t one_zero = (common_bits | ~char_mask);
+ if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
+ pos->determines_perfectly = true;
+ }
+ pos->mask = common_bits;
+ pos->value = bits;
+ }
+ } else {
+ // Don't ignore case. Nice simple case where the mask-compare will
+ // determine definitely whether we have a match at this character
+ // position.
+ if (c > char_mask) {
+ details->set_cannot_match();
+ pos->determines_perfectly = false;
+ return;
+ }
+ pos->mask = char_mask;
+ pos->value = c;
+ pos->determines_perfectly = true;
+ }
+ characters_filled_in++;
+ DCHECK(characters_filled_in <= details->characters());
+ if (characters_filled_in == details->characters()) {
+ return;
+ }
+ }
+ } else {
+ // Character class: fills exactly one position.
+ QuickCheckDetails::Position* pos =
+ details->positions(characters_filled_in);
+ RegExpClassRanges* tree = elm.class_ranges();
+ ZoneList<CharacterRange>* ranges = tree->ranges(zone());
+ if (tree->is_negated() || ranges->is_empty()) {
+ // A quick check uses multi-character mask and compare. There is no
+ // useful way to incorporate a negative char class into this scheme
+ // so we just conservatively create a mask and value that will always
+ // succeed.
+ // Likewise for empty ranges (empty ranges can occur e.g. when
+ // compiling for one-byte subjects and impossible (non-one-byte) ranges
+ // have been removed).
+ pos->mask = 0;
+ pos->value = 0;
+ } else {
+ // Skip ranges that start above the largest representable character;
+ // if none remain the class cannot match.
+ int first_range = 0;
+ while (ranges->at(first_range).from() > char_mask) {
+ first_range++;
+ if (first_range == ranges->length()) {
+ details->set_cannot_match();
+ pos->determines_perfectly = false;
+ return;
+ }
+ }
+ CharacterRange range = ranges->at(first_range);
+ const base::uc32 first_from = range.from();
+ // Clamp the upper end of the range to char_mask.
+ const base::uc32 first_to =
+ (range.to() > char_mask) ? char_mask : range.to();
+ const uint32_t differing_bits = (first_from ^ first_to);
+ // A mask and compare is only perfect if the differing bits form a
+ // number like 00011111 with one single block of trailing 1s.
+ if ((differing_bits & (differing_bits + 1)) == 0 &&
+ first_from + differing_bits == first_to) {
+ pos->determines_perfectly = true;
+ }
+ uint32_t common_bits = ~SmearBitsRight(differing_bits);
+ uint32_t bits = (first_from & common_bits);
+ for (int i = first_range + 1; i < ranges->length(); i++) {
+ range = ranges->at(i);
+ const base::uc32 from = range.from();
+ if (from > char_mask) continue;
+ const base::uc32 to =
+ (range.to() > char_mask) ? char_mask : range.to();
+ // Here we are combining more ranges into the mask and compare
+ // value. With each new range the mask becomes more sparse and
+ // so the chances of a false positive rise. A character class
+ // with multiple ranges is assumed never to be equivalent to a
+ // mask and compare operation.
+ pos->determines_perfectly = false;
+ uint32_t new_common_bits = (from ^ to);
+ new_common_bits = ~SmearBitsRight(new_common_bits);
+ common_bits &= new_common_bits;
+ bits &= new_common_bits;
+ uint32_t new_differing_bits = (from & common_bits) ^ bits;
+ common_bits ^= new_differing_bits;
+ bits &= common_bits;
+ }
+ pos->mask = common_bits;
+ pos->value = bits;
+ }
+ characters_filled_in++;
+ DCHECK(characters_filled_in <= details->characters());
+ if (characters_filled_in == details->characters()) return;
+ }
+ }
+ DCHECK(characters_filled_in != details->characters());
+ // This node's text ran out before all positions were filled: let the
+ // successor fill in the remaining ones, passing not_at_start = true.
+ if (!details->cannot_match()) {
+ on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
+ true);
+ }
+}
+
+// Resets every position currently in use and sets the character count to
+// zero.
+void QuickCheckDetails::Clear() {
+  for (int i = 0; i < characters_; i++) {
+    Position& pos = positions_[i];
+    pos.mask = 0;
+    pos.value = 0;
+    pos.determines_perfectly = false;
+  }
+  characters_ = 0;
+}
+
+// Drops the first `by` positions, shifting the remaining ones to the front.
+// If `by` is negative or covers every position, the details are cleared
+// instead.
+void QuickCheckDetails::Advance(int by, bool one_byte) {
+  const bool keeps_some_positions = by >= 0 && by < characters_;
+  if (!keeps_some_positions) {
+    DCHECK_IMPLIES(by < 0, characters_ == 0);
+    Clear();
+    return;
+  }
+
+  const int remaining = characters_ - by;
+  DCHECK_LE(remaining, 4);
+  DCHECK_LE(characters_, 4);
+
+  // Move the surviving positions down to the front ...
+  for (int dst = 0; dst < remaining; dst++) {
+    positions_[dst] = positions_[dst + by];
+  }
+  // ... and reset the slots that are no longer in use.
+  for (int stale = remaining; stale < characters_; stale++) {
+    Position& pos = positions_[stale];
+    pos.mask = 0;
+    pos.value = 0;
+    pos.determines_perfectly = false;
+  }
+  characters_ = remaining;
+  // We could change mask_ and value_ here but we would never advance unless
+  // they had already been used in a check and they won't be used again because
+  // it would gain us nothing. So there's no point.
+}
+
+// Merges the details of another alternative into this one, starting at
+// position from_index, so that the result is a check that both alternatives
+// pass. Note that the values stored in `other` are masked as a side effect.
+void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
+  DCHECK(characters_ == other->characters_);
+  // An alternative that cannot match contributes nothing.
+  if (other->cannot_match_) return;
+  // If this side cannot match, the other side's details are the result.
+  if (cannot_match_) {
+    *this = *other;
+    return;
+  }
+
+  for (int i = from_index; i < characters_; i++) {
+    Position& mine = *positions(i);
+    Position& theirs = *other->positions(i);
+
+    // Our mask-compare operation will be approximate unless we have the
+    // exact same operation on both sides of the alternation.
+    const bool same_operation =
+        mine.mask == theirs.mask && mine.value == theirs.value;
+    if (!same_operation || !theirs.determines_perfectly) {
+      mine.determines_perfectly = false;
+    }
+
+    // Keep only the bits both sides test, then drop the bits on which their
+    // expected values disagree.
+    mine.mask &= theirs.mask;
+    mine.value &= mine.mask;
+    theirs.value &= mine.mask;
+    const uint32_t disagreeing_bits = (mine.value ^ theirs.value);
+    mine.mask &= ~disagreeing_bits;
+    mine.value &= mine.mask;
+  }
+}
+
+// Scoped marker: flags a NodeInfo as visited for the lifetime of this object
+// and clears the flag again on destruction.
+class VisitMarker {
+ public:
+  explicit VisitMarker(NodeInfo* info) : info_(info) {
+    DCHECK(!info->visited);
+    info->visited = true;
+  }
+  ~VisitMarker() { info_->visited = false; }
+  // Not copyable: a copy would clear the flag while the original marker is
+  // still alive. This matches the other scoped helpers in this file.
+  VisitMarker(const VisitMarker&) = delete;
+  VisitMarker& operator=(const VisitMarker&) = delete;
+
+ private:
+  NodeInfo* info_;
+};
+
+// Scoped guard that sets traversed_loop_initialization_node_ on a
+// LoopChoiceNode for as long as the guard is alive, and resets it afterwards.
+class LoopInitializationMarker {
+ public:
+  LoopInitializationMarker(const LoopInitializationMarker&) = delete;
+  LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
+
+  explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
+    // The flag must not already be set when we enter.
+    DCHECK(!node->traversed_loop_initialization_node_);
+    node->traversed_loop_initialization_node_ = true;
+  }
+
+  ~LoopInitializationMarker() {
+    DCHECK(node_->traversed_loop_initialization_node_);
+    node_->traversed_loop_initialization_node_ = false;
+  }
+
+ private:
+  LoopChoiceNode* node_;
+};
+
+// Scoped guard that lowers a LoopChoiceNode's min_loop_iterations_ by one
+// while it is alive and restores the previous count on destruction.
+class IterationDecrementer {
+ public:
+  IterationDecrementer(const IterationDecrementer&) = delete;
+  IterationDecrementer& operator=(const IterationDecrementer&) = delete;
+
+  explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
+    // There has to be an iteration left to take away.
+    DCHECK_GT(node->min_loop_iterations_, 0);
+    node->min_loop_iterations_ -= 1;
+  }
+
+  ~IterationDecrementer() { node_->min_loop_iterations_ += 1; }
+
+ private:
+  LoopChoiceNode* node_;
+};
+
+// Filters this node for one-byte subjects by filtering its successor. A cached
+// replacement is reused if present, and the node is returned unchanged once
+// the depth budget is exhausted.
+RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
+  NodeInfo* node_info = info();
+  if (node_info->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  DCHECK(!node_info->visited);
+  // Mark the node as visited while the successor chain is processed.
+  VisitMarker marker(node_info);
+  return FilterSuccessor(depth - 1, flags);
+}
+
+// Filters the successor node. If the successor is filtered away (nullptr) then
+// so is this node; otherwise the successor is replaced by its filtered
+// version and this node becomes its own replacement.
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
+  RegExpNode* filtered_successor = on_success_->FilterOneByte(depth - 1, flags);
+  if (filtered_successor != nullptr) {
+    on_success_ = filtered_successor;
+    return set_replacement(this);
+  }
+  return set_replacement(nullptr);
+}
+
+// Returns true if |range| contains any of the characters 0x39C, 0x3BC or
+// 0x178, which are the ones we need to check for.
+bool RangeContainsLatin1Equivalents(CharacterRange range) {
+  // TODO(dcarney): this could be a lot more efficient.
+  const base::uc32 kCharactersToCheck[] = {0x039C, 0x03BC, 0x0178};
+  for (const base::uc32 character : kCharactersToCheck) {
+    if (range.Contains(character)) return true;
+  }
+  return false;
+}
+
+namespace {
+
+// Returns true as soon as one range in |ranges| satisfies
+// RangeContainsLatin1Equivalents; false if none does (or the list is empty).
+bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
+  const int range_count = ranges->length();
+  int index = 0;
+  while (index < range_count) {
+    // TODO(dcarney): this could be a lot more efficient.
+    if (RangeContainsLatin1Equivalents(ranges->at(index))) return true;
+    index++;
+  }
+  return false;
+}
+
+} // namespace
+
+// Filters a text node for one-byte subjects. Returns nullptr (and caches that
+// as the replacement) when some element can never match a one-byte character;
+// otherwise continues with the successor. Atom characters are rewritten in
+// place after the optional case-insensitive conversion to Latin-1.
+RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  DCHECK(!info()->visited);
+  VisitMarker marker(info());
+  const int element_count = elements()->length();
+  for (int i = 0; i < element_count; i++) {
+    TextElement elm = elements()->at(i);
+
+    if (elm.text_type() == TextElement::ATOM) {
+      base::Vector<const base::uc16> quarks = elm.atom()->data();
+      base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
+      const int quark_count = quarks.length();
+      for (int j = 0; j < quark_count; j++) {
+        base::uc16 c = quarks[j];
+        if (IsIgnoreCase(flags)) {
+          c = unibrow::Latin1::TryConvertToLatin1(c);
+        }
+        if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
+        // Replace quark in case we converted to Latin-1.
+        writable_quarks[j] = c;
+      }
+      continue;
+    }
+
+    DCHECK(elm.text_type() == TextElement::CLASS_RANGES);
+    RegExpClassRanges* cr = elm.class_ranges();
+    ZoneList<CharacterRange>* ranges = cr->ranges(zone());
+    CharacterRange::Canonicalize(ranges);
+    // Now they are in order so we only need to look at the first.
+    const int range_count = ranges->length();
+    // A negated class is hopeless when its first range already covers the
+    // whole one-byte range; a plain class is hopeless when it is empty or
+    // starts above the one-byte range.
+    const bool excludes_all_one_byte =
+        cr->is_negated()
+            ? (range_count != 0 && ranges->at(0).from() == 0 &&
+               ranges->at(0).to() >= String::kMaxOneByteCharCode)
+            : (range_count == 0 ||
+               ranges->at(0).from() > String::kMaxOneByteCharCode);
+    if (!excludes_all_one_byte) continue;
+    // This will be handled in a later filter.
+    if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
+      continue;
+    }
+    return set_replacement(nullptr);
+  }
+  return FilterSuccessor(depth - 1, flags);
+}
+
+// Filters a loop for one-byte subjects. The continuation is examined first:
+// if nothing can follow the loop there is no point in keeping the loop at
+// all. Otherwise the node is filtered like an ordinary choice node.
+RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0 || info()->visited) return this;
+  {
+    // The marker only needs to cover the look at the continuation; the
+    // ChoiceNode implementation below installs its own.
+    VisitMarker marker(info());
+    if (continue_node_->FilterOneByte(depth - 1, flags) == nullptr) {
+      return set_replacement(nullptr);
+    }
+  }
+
+  return ChoiceNode::FilterOneByte(depth - 1, flags);
+}
+
+// Filters the alternatives of a choice node for one-byte subjects.
+//  - If any alternative carries guards the node is kept untouched.
+//  - If fewer than two alternatives survive, the sole survivor (or nullptr)
+//    becomes the replacement for this node.
+//  - Otherwise this node is kept, with its alternatives list rebuilt when
+//    some alternatives were filtered away.
+RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  const int choice_count = alternatives_->length();
+
+  // Guarded alternatives are left alone entirely.
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative candidate = alternatives_->at(i);
+    const bool is_guarded =
+        candidate.guards() != nullptr && candidate.guards()->length() != 0;
+    if (is_guarded) {
+      set_replacement(this);
+      return this;
+    }
+  }
+
+  int survivor_count = 0;
+  RegExpNode* last_survivor = nullptr;
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative candidate = alternatives_->at(i);
+    RegExpNode* filtered = candidate.node()->FilterOneByte(depth - 1, flags);
+    DCHECK(filtered != this);  // No missing EMPTY_MATCH_CHECK.
+    if (filtered == nullptr) continue;
+    alternatives_->at(i).set_node(filtered);
+    survivor_count++;
+    last_survivor = filtered;
+  }
+  if (survivor_count < 2) return set_replacement(last_survivor);
+
+  set_replacement(this);
+  if (survivor_count == choice_count) {
+    return this;
+  }
+  // Only some of the nodes survived the filtering. We need to rebuild the
+  // alternatives list.
+  ZoneList<GuardedAlternative>* rebuilt =
+      zone()->New<ZoneList<GuardedAlternative>>(survivor_count, zone());
+  for (int i = 0; i < choice_count; i++) {
+    RegExpNode* filtered =
+        alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
+    if (filtered == nullptr) continue;
+    alternatives_->at(i).set_node(filtered);
+    rebuilt->Add(alternatives_->at(i), zone());
+  }
+  alternatives_ = rebuilt;
+  return this;
+}
+
+// Filters a negative lookaround for one-byte subjects. The continuation is
+// filtered first: without it nothing can match. If the lookaround body itself
+// is filtered away, the continuation alone replaces this node.
+RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
+                                                        RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  // Alternative 0 is the negative lookahead, alternative 1 is what comes
+  // afterwards.
+  RegExpNode* filtered_continuation =
+      continue_node()->FilterOneByte(depth - 1, flags);
+  if (filtered_continuation == nullptr) return set_replacement(nullptr);
+  alternatives_->at(kContinueIndex).set_node(filtered_continuation);
+
+  RegExpNode* filtered_lookaround =
+      lookaround_node()->FilterOneByte(depth - 1, flags);
+  // If the negative lookahead is always going to fail then
+  // we don't need to check it.
+  if (filtered_lookaround == nullptr) {
+    return set_replacement(filtered_continuation);
+  }
+  alternatives_->at(kLookaroundIndex).set_node(filtered_lookaround);
+  return set_replacement(this);
+}
+
+// Collects quick check details for a loop. Nothing is collected if the body
+// may be zero length or the node is currently being visited.
+void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                          RegExpCompiler* compiler,
+                                          int characters_filled_in,
+                                          bool not_at_start) {
+  if (body_can_be_zero_length_ || info()->visited) return;
+  not_at_start = not_at_start || this->not_at_start();
+  DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
+
+  const bool body_must_run_and_consume =
+      traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
+      loop_node_->EatsAtLeast(not_at_start) >
+          continue_node_->EatsAtLeast(true);
+  if (!body_must_run_and_consume) {
+    // Might not consume anything in the loop body, so treat it like a normal
+    // ChoiceNode (and don't recursively visit this node again).
+    VisitMarker marker(info());
+    ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
+                                     not_at_start);
+    return;
+  }
+
+  // Loop body is guaranteed to execute at least once, and consume characters
+  // when it does, meaning the only possible quick checks from this point
+  // begin with the loop body. We may recursively visit this LoopChoiceNode,
+  // but we temporarily decrease its minimum iteration counter so we know when
+  // to check the continue case.
+  IterationDecrementer next_iteration(this);
+  loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
+                                   not_at_start);
+}
+
+// Entry point used when the loop is reached from its initialization. On the
+// first entry the node is flagged as "entered from the top" for the duration
+// of the traversal.
+void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
+    QuickCheckDetails* details, RegExpCompiler* compiler,
+    int characters_filled_in, bool not_at_start) {
+  if (!traversed_loop_initialization_node_) {
+    // We are entering a loop via its counter initialization action, meaning we
+    // are guaranteed to run the loop body at least some minimum number of
+    // times before running the continuation node. Set a flag so that this node
+    // knows (now and any times we visit it again recursively) that it was
+    // entered from the top.
+    LoopInitializationMarker marker(this);
+    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
+    return;
+  }
+  // We already entered this loop once, exited via its continuation node, and
+  // followed an outer loop's back-edge to before the loop entry point. We
+  // could try to reset the minimum iteration count to its starting value at
+  // this point, but that seems like more trouble than it's worth. It's safe
+  // to keep going with the current (possibly reduced) minimum iteration
+  // count.
+  GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
+}
+
+// Fills in Boyer-Moore lookahead information for a loop. When the body can be
+// zero length or the budget is used up, the rest of the lookahead from
+// |offset| is handed to SetRest; otherwise the node is handled like a plain
+// ChoiceNode with a reduced budget. The result is saved in both cases.
+void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+                                  BoyerMooreLookahead* bm, bool not_at_start) {
+  const bool give_up = body_can_be_zero_length_ || budget <= 0;
+  if (give_up) {
+    bm->SetRest(offset);
+  } else {
+    ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+  }
+  SaveBMInfo(bm, not_at_start, offset);
+}
+
+// Collects quick check details for a choice: the first alternative fills in
+// |details| directly and each further alternative is merged into it.
+void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                      RegExpCompiler* compiler,
+                                      int characters_filled_in,
+                                      bool not_at_start) {
+  not_at_start = not_at_start || not_at_start_;
+  const int choice_count = alternatives_->length();
+  DCHECK_LT(0, choice_count);
+  RegExpNode* first_alternative = alternatives_->at(0).node();
+  first_alternative->GetQuickCheckDetails(details, compiler,
+                                          characters_filled_in, not_at_start);
+  for (int i = 1; i < choice_count; i++) {
+    QuickCheckDetails alternative_details(details->characters());
+    alternatives_->at(i).node()->GetQuickCheckDetails(
+        &alternative_details, compiler, characters_filled_in, not_at_start);
+    // Here we merge the quick match details of the two branches.
+    details->Merge(&alternative_details, characters_filled_in);
+  }
+}
+
+namespace {
+
+// Emits a check for [0-9A-Z_a-z]. Control is transferred to |word| or
+// |non_word| depending on the current character, except that the case named
+// by |fall_through_on_word| falls through instead of jumping.
+void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
+                   Label* non_word, bool fall_through_on_word) {
+  const StandardCharacterSet special_class =
+      fall_through_on_word ? StandardCharacterSet::kWord
+                           : StandardCharacterSet::kNotWord;
+  Label* const special_class_target = fall_through_on_word ? non_word : word;
+  if (assembler->CheckSpecialClassRanges(special_class,
+                                         special_class_target)) {
+    // Optimized implementation available.
+    return;
+  }
+  // Generic fallback: successive range comparisons.
+  assembler->CheckCharacterGT('z', non_word);
+  assembler->CheckCharacterLT('0', non_word);
+  assembler->CheckCharacterGT('a' - 1, word);
+  assembler->CheckCharacterLT('9' + 1, word);
+  assembler->CheckCharacterLT('A', non_word);
+  assembler->CheckCharacterLT('Z' + 1, word);
+  // Only '_' is left to decide; which side jumps depends on the fall-through.
+  if (!fall_through_on_word) {
+    assembler->CheckCharacter('_', word);
+  } else {
+    assembler->CheckNotCharacter('_', non_word);
+  }
+}
+
+// Emit the code to check for a ^ in multiline mode (1-character lookbehind
+// that matches newline or the start of input). On success, code for
+// |on_success| is emitted with the current character invalidated.
+void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
+  RegExpMacroAssembler* masm = compiler->macro_assembler();
+
+  // We will load the previous character into the current character register.
+  Trace lookbehind_trace(*trace);
+  lookbehind_trace.InvalidateCurrentCharacter();
+
+  // A positive (> 0) cp_offset means we've already successfully matched a
+  // non-empty-width part of the pattern, and thus cannot be at or before the
+  // start of the subject string. We can thus skip both at-start and
+  // bounds-checks when loading the one-character lookbehind.
+  const bool may_be_at_or_before_subject_string_start =
+      lookbehind_trace.cp_offset() <= 0;
+
+  Label ok;
+  if (may_be_at_or_before_subject_string_start) {
+    // The start of input counts as a newline in this context, so skip to ok if
+    // we are at the start.
+    masm->CheckAtStart(lookbehind_trace.cp_offset(), &ok);
+  }
+
+  // If we've already checked that we are not at the start of input, it's okay
+  // to load the previous character without bounds checks.
+  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
+  masm->LoadCurrentCharacter(lookbehind_trace.cp_offset() - 1,
+                             lookbehind_trace.backtrack(),
+                             can_skip_bounds_check);
+  const bool handled_by_special_class = masm->CheckSpecialClassRanges(
+      StandardCharacterSet::kLineTerminator, lookbehind_trace.backtrack());
+  if (!handled_by_special_class) {
+    // Newline means \n, \r, 0x2028 or 0x2029.
+    if (!compiler->one_byte()) {
+      masm->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
+    }
+    masm->CheckCharacter('\n', &ok);
+    masm->CheckNotCharacter('\r', lookbehind_trace.backtrack());
+  }
+  masm->Bind(&ok);
+  on_success->Emit(compiler, &lookbehind_trace);
+}
+
+} // namespace
+
+// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
+//
+// Boyer-Moore lookahead information is consulted first to find out whether
+// the next character is already known to be a word or a non-word character.
+// If it is, only the previous character has to be tested; otherwise both
+// cases are emitted.
+void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  Isolate* isolate = assembler->isolate();
+  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
+  // Records what |bm| knows about the character at offset 0. The word test
+  // runs last, so it takes precedence should both predicates hold.
+  auto classify_next = [&next_is_word_character](BoyerMooreLookahead* bm) {
+    if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
+    if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
+  };
+
+  const bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
+  BoyerMooreLookahead* cached_lookahead = bm_info(not_at_start);
+  if (cached_lookahead != nullptr) {
+    classify_next(cached_lookahead);
+  } else {
+    const int eats_at_least =
+        std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
+    if (eats_at_least >= 1) {
+      BoyerMooreLookahead* fresh_lookahead =
+          zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
+      FillInBMInfo(isolate, 0, kRecursionBudget, fresh_lookahead,
+                   not_at_start);
+      classify_next(fresh_lookahead);
+    }
+  }
+
+  const bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
+  // Which kind of previous character makes the assertion fail, depending on
+  // what the next character is.
+  const AssertionNode::IfPrevious fail_when_next_is_word =
+      at_boundary ? kIsWord : kIsNonWord;
+  const AssertionNode::IfPrevious fail_when_next_is_non_word =
+      at_boundary ? kIsNonWord : kIsWord;
+
+  if (next_is_word_character == Trace::UNKNOWN) {
+    Label before_non_word;
+    Label before_word;
+    if (trace->characters_preloaded() != 1) {
+      assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
+    }
+    // Fall through on non-word.
+    EmitWordCheck(assembler, &before_word, &before_non_word, false);
+    // Next character is not a word character.
+    assembler->Bind(&before_non_word);
+    Label ok;
+    BacktrackIfPrevious(compiler, trace, fail_when_next_is_non_word);
+    assembler->GoTo(&ok);
+
+    // Next character is a word character.
+    assembler->Bind(&before_word);
+    BacktrackIfPrevious(compiler, trace, fail_when_next_is_word);
+    assembler->Bind(&ok);
+  } else if (next_is_word_character == Trace::TRUE_VALUE) {
+    BacktrackIfPrevious(compiler, trace, fail_when_next_is_word);
+  } else {
+    DCHECK(next_is_word_character == Trace::FALSE_VALUE);
+    BacktrackIfPrevious(compiler, trace, fail_when_next_is_non_word);
+  }
+}
+
+// Emits a one-character lookbehind that backtracks when the previous
+// character is of the kind named by |backtrack_if_previous| (word or
+// non-word), then emits the successor node for the other case.
+void AssertionNode::BacktrackIfPrevious(
+    RegExpCompiler* compiler, Trace* trace,
+    AssertionNode::IfPrevious backtrack_if_previous) {
+  RegExpMacroAssembler* masm = compiler->macro_assembler();
+  Trace lookbehind_trace(*trace);
+  lookbehind_trace.InvalidateCurrentCharacter();
+
+  // One of the two outcomes backtracks, the other falls through to the
+  // successor.
+  const bool backtrack_on_non_word = backtrack_if_previous == kIsNonWord;
+  Label fall_through;
+  Label* non_word =
+      backtrack_on_non_word ? lookbehind_trace.backtrack() : &fall_through;
+  Label* word =
+      backtrack_on_non_word ? &fall_through : lookbehind_trace.backtrack();
+
+  // A positive (> 0) cp_offset means we've already successfully matched a
+  // non-empty-width part of the pattern, and thus cannot be at or before the
+  // start of the subject string. We can thus skip both at-start and
+  // bounds-checks when loading the one-character lookbehind.
+  const bool may_be_at_or_before_subject_string_start =
+      lookbehind_trace.cp_offset() <= 0;
+
+  if (may_be_at_or_before_subject_string_start) {
+    // The start of input counts as a non-word character, so the question is
+    // decided if we are at the start.
+    masm->CheckAtStart(lookbehind_trace.cp_offset(), non_word);
+  }
+
+  // If we've already checked that we are not at the start of input, it's okay
+  // to load the previous character without bounds checks.
+  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
+  masm->LoadCurrentCharacter(lookbehind_trace.cp_offset() - 1, non_word,
+                             can_skip_bounds_check);
+  EmitWordCheck(masm, word, non_word, backtrack_on_non_word);
+
+  masm->Bind(&fall_through);
+  on_success()->Emit(compiler, &lookbehind_trace);
+}
+
+// An AT_START assertion reached when we know we are not at the start can
+// never match; every other assertion just forwards to its successor.
+void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                         RegExpCompiler* compiler,
+                                         int filled_in, bool not_at_start) {
+  const bool impossible = assertion_type_ == AT_START && not_at_start;
+  if (impossible) {
+    details->set_cannot_match();
+  } else {
+    on_success()->GetQuickCheckDetails(details, compiler, filled_in,
+                                       not_at_start);
+  }
+}
+
+// Emits code for an assertion. Newline and word-boundary assertions are
+// delegated to their helpers, which also emit the successor. The remaining
+// cases emit their check here and, unless they return early, fall out of the
+// switch to emit the successor with the unchanged trace.
+void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* masm = compiler->macro_assembler();
+  switch (assertion_type_) {
+    case AFTER_NEWLINE:
+      EmitHat(compiler, on_success(), trace);
+      return;
+    case AT_BOUNDARY:
+    case AT_NON_BOUNDARY:
+      EmitBoundaryCheck(compiler, trace);
+      return;
+    case AT_START: {
+      if (trace->at_start() == Trace::FALSE_VALUE) {
+        // Known not to be at the start: the assertion always fails.
+        masm->GoTo(trace->backtrack());
+        return;
+      }
+      if (trace->at_start() == Trace::UNKNOWN) {
+        masm->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
+        // Past the check we know we are at the start.
+        Trace at_start_trace(*trace);
+        at_start_trace.set_at_start(Trace::TRUE_VALUE);
+        on_success()->Emit(compiler, &at_start_trace);
+        return;
+      }
+      break;
+    }
+    case AT_END: {
+      Label ok;
+      masm->CheckPosition(trace->cp_offset(), &ok);
+      masm->GoTo(trace->backtrack());
+      masm->Bind(&ok);
+      break;
+    }
+  }
+  on_success()->Emit(compiler, trace);
+}
+
+namespace {
+
+// True if |quick_check| is present, covers |offset|, and flags the position
+// at |offset| as determines_perfectly.
+bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
+  const bool covers_offset =
+      quick_check != nullptr && offset < quick_check->characters();
+  return covers_offset && quick_check->positions(offset)->determines_perfectly;
+}
+
+// Raises |*checked_up_to| to |index| if |index| lies beyond it; the value is
+// never lowered.
+void UpdateBoundsCheck(int index, int* checked_up_to) {
+  *checked_up_to = std::max(*checked_up_to, index);
+}
+
+} // namespace
+
+// Generates the code for one pass over the text node; it is called once per
+// pass. The passes are in increasing order of difficulty because we hope one
+// of the first passes will fail, in which case we are saved the work of the
+// later passes. For example, for the case independent regexp /%[asdfghjkl]a/
+// we will check the '%' in the first pass, the case independent 'a' in the
+// second pass and the character class in the last pass.
+//
+// The passes are done from right to left, so for example to test for /bar/
+// we will first test for an 'r' with offset 2, then an 'a' with offset 1
+// and then a 'b' with offset 0. This means we can avoid the end-of-input
+// bounds check most of the time. In the example we only need to check for
+// end-of-input when loading the putative 'r'.
+//
+// A slight complication involves the fact that the first character may already
+// be fetched into a register by the previous node. In this case we want to
+// do the test for that character first. We do this in separate passes. The
+// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
+// pass has been performed then subsequent passes will have true in
+// first_element_checked to indicate that that character does not need to be
+// checked again.
+//
+// In addition to all this we are passed a Trace, which can
+// contain an AlternativeGeneration object. In this AlternativeGeneration
+// object we can see details of any quick check that was already passed in
+// order to get to the code we are now generating. The quick check can involve
+// loading characters, which means we do not need to recheck the bounds
+// up to the limit the quick check already checked. In addition the quick
+// check can have involved a mask and compare operation which may simplify
+// or obviate the need for further checks at some character positions.
+void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
+                            bool preloaded, Trace* trace,
+                            bool first_element_checked, int* checked_up_to) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  Isolate* isolate = assembler->isolate();
+  const bool one_byte = compiler->one_byte();
+  Label* backtrack = trace->backtrack();
+  QuickCheckDetails* quick_check = trace->quick_check_performed();
+  const int last_element_index = elements()->length() - 1;
+  const int backward_offset = read_backward() ? -Length() : 0;
+  // A preloaded pass only looks at the very first character.
+  for (int i = preloaded ? 0 : last_element_index; i >= 0; i--) {
+    TextElement elm = elements()->at(i);
+    const int cp_offset =
+        trace->cp_offset() + elm.cp_offset() + backward_offset;
+
+    if (elm.text_type() != TextElement::ATOM) {
+      DCHECK_EQ(TextElement::CLASS_RANGES, elm.text_type());
+      // Character classes are only handled in their dedicated pass.
+      if (pass != CHARACTER_CLASS_MATCH) continue;
+      if (first_element_checked && i == 0) continue;
+      if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
+      RegExpClassRanges* cr = elm.class_ranges();
+      const bool bounds_check = *checked_up_to < cp_offset || read_backward();
+      EmitClassRanges(assembler, cr, one_byte, backtrack, cp_offset,
+                      bounds_check, preloaded, zone());
+      UpdateBoundsCheck(cp_offset, checked_up_to);
+      continue;
+    }
+
+    if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
+    base::Vector<const base::uc16> quarks = elm.atom()->data();
+    for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
+      if (first_element_checked && i == 0 && j == 0) continue;
+      if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
+      base::uc16 quark = quarks[j];
+      if (IsIgnoreCase(compiler->flags())) {
+        // Everywhere else we assume that a non-Latin-1 character cannot match
+        // a Latin-1 character. Avoid the cases where this assumption is
+        // invalid by using the Latin1 equivalent instead.
+        quark = unibrow::Latin1::TryConvertToLatin1(quark);
+      }
+      const int char_offset = cp_offset + j;
+      const bool needs_bounds_check =
+          *checked_up_to < char_offset || read_backward();
+      bool bounds_checked = false;
+      switch (pass) {
+        case NON_LATIN1_MATCH:
+          DCHECK(one_byte);
+          if (quark > String::kMaxOneByteCharCode) {
+            // This atom can never match a one-byte subject.
+            assembler->GoTo(backtrack);
+            return;
+          }
+          break;
+        case NON_LETTER_CHARACTER_MATCH:
+          bounds_checked =
+              EmitAtomNonLetter(isolate, compiler, quark, backtrack,
+                                char_offset, needs_bounds_check, preloaded);
+          break;
+        case SIMPLE_CHARACTER_MATCH:
+          bounds_checked =
+              EmitSimpleCharacter(isolate, compiler, quark, backtrack,
+                                  char_offset, needs_bounds_check, preloaded);
+          break;
+        case CASE_CHARACTER_MATCH:
+          bounds_checked =
+              EmitAtomLetter(isolate, compiler, quark, backtrack, char_offset,
+                             needs_bounds_check, preloaded);
+          break;
+        default:
+          break;
+      }
+      if (bounds_checked) UpdateBoundsCheck(char_offset, checked_up_to);
+    }
+  }
+}
+
+// Total length of the text node: the offset of its last element plus that
+// element's own length.
+int TextNode::Length() {
+  TextElement last_element = elements()->last();
+  const int last_offset = last_element.cp_offset();
+  DCHECK_LE(0, last_offset);
+  return last_offset + last_element.length();
+}
+
+// Decides whether |pass| is skipped for atoms: case-insensitive matching
+// skips the simple character pass, case-sensitive matching skips the
+// non-letter and case character passes.
+bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
+  return ignore_case ? pass == SIMPLE_CHARACTER_MATCH
+                     : (pass == NON_LETTER_CHARACTER_MATCH ||
+                        pass == CASE_CHARACTER_MATCH);
+}
+
+// Creates a text node matching a single character out of |ranges|.
+TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
+                                             ZoneList<CharacterRange>* ranges,
+                                             bool read_backward,
+                                             RegExpNode* on_success) {
+  DCHECK_NOT_NULL(ranges);
+  // TODO(jgruber): There's no fundamental need to create this
+  // RegExpClassRanges; we could refactor to avoid the allocation.
+  RegExpClassRanges* class_ranges = zone->New<RegExpClassRanges>(zone, ranges);
+  return zone->New<TextNode>(class_ranges, read_backward, on_success);
+}
+
+// Creates a two-element text node for a surrogate pair whose lead surrogate
+// is a single range and whose trail surrogate is given as a list of ranges.
+TextNode* TextNode::CreateForSurrogatePair(
+    Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
+    bool read_backward, RegExpNode* on_success) {
+  ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
+  ZoneList<TextElement>* pair_elements =
+      zone->New<ZoneList<TextElement>>(2, zone);
+  RegExpClassRanges* lead_class =
+      zone->New<RegExpClassRanges>(zone, lead_ranges);
+  pair_elements->Add(TextElement::ClassRanges(lead_class), zone);
+  RegExpClassRanges* trail_class =
+      zone->New<RegExpClassRanges>(zone, trail_ranges);
+  pair_elements->Add(TextElement::ClassRanges(trail_class), zone);
+  return zone->New<TextNode>(pair_elements, read_backward, on_success);
+}
+
+// Creates a two-element text node for a surrogate pair whose lead surrogate
+// is given as a list of ranges and whose trail surrogate is a single range.
+TextNode* TextNode::CreateForSurrogatePair(
+    Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
+    bool read_backward, RegExpNode* on_success) {
+  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
+  ZoneList<TextElement>* pair_elements =
+      zone->New<ZoneList<TextElement>>(2, zone);
+  RegExpClassRanges* lead_class =
+      zone->New<RegExpClassRanges>(zone, lead_ranges);
+  pair_elements->Add(TextElement::ClassRanges(lead_class), zone);
+  RegExpClassRanges* trail_class =
+      zone->New<RegExpClassRanges>(zone, trail_ranges);
+  pair_elements->Add(TextElement::ClassRanges(trail_class), zone);
+  return zone->New<TextNode>(pair_elements, read_backward, on_success);
+}
+
+// This generates the code to match a text node. A text node can contain
+// straight character sequences (possibly to be matched in a case-independent
+// way) and character classes. For efficiency we do not do this in a single
+// pass from left to right. Instead we pass over the text node several times,
+// emitting code for some character positions every time. See the comment on
+// TextEmitPass for details.
+void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  const LimitResult limit_result = LimitVersions(compiler, trace);
+  if (limit_result == DONE) return;
+  DCHECK(limit_result == CONTINUE);
+
+  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
+    compiler->SetRegExpTooBig();
+    return;
+  }
+
+  if (compiler->one_byte()) {
+    // The bounds bookkeeping of this pass is thrown away.
+    int dummy = 0;
+    TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
+  }
+
+  int bound_checked_to = trace->cp_offset() - 1 + trace->bound_checked_up_to();
+
+  // Runs every real pass once, sharing the bounds bookkeeping.
+  auto run_real_passes = [&](bool preloaded, bool first_element_checked) {
+    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+      TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), preloaded,
+                   trace, first_element_checked, &bound_checked_to);
+    }
+  };
+
+  // If a character is preloaded into the current character register then
+  // check that now.
+  const bool first_elt_done = trace->characters_preloaded() == 1;
+  if (first_elt_done) run_real_passes(true, false);
+  run_real_passes(false, first_elt_done);
+
+  Trace successor_trace(*trace);
+  // If we advance backward, we may end up at the start.
+  const int advance_by = read_backward() ? -Length() : Length();
+  successor_trace.AdvanceCurrentPositionInTrace(advance_by, compiler);
+  successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
+                                               : Trace::FALSE_VALUE);
+  RecursionCheck rc(compiler);
+  on_success()->Emit(compiler, &successor_trace);
+}
+
+// Records that no characters are preloaded any more.
+void Trace::InvalidateCurrentCharacter() {
+  characters_preloaded_ = 0;
+}
+
+// Moves the trace's current position by |by| characters and updates the
+// bookkeeping that depends on it. Flags the regexp as too big if the offset
+// grows beyond what the macro assembler supports.
+void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
+  // We don't have an instruction for shifting the current character register
+  // down or for using a shifted value for anything so let's just forget that
+  // we preloaded any characters into it.
+  characters_preloaded_ = 0;
+  // Adjust the offsets of the quick check performed information. This
+  // information is used to find out what we already determined about the
+  // characters by means of mask and compare.
+  quick_check_performed_.Advance(by, compiler->one_byte());
+  cp_offset_ += by;
+  const bool offset_too_large =
+      cp_offset_ > RegExpMacroAssembler::kMaxCPOffset;
+  if (offset_too_large) {
+    compiler->SetRegExpTooBig();
+    cp_offset_ = 0;
+  }
+  // The bounds-checked distance shrinks by the same amount, but never below
+  // zero.
+  const int still_checked = bound_checked_up_to_ - by;
+  bound_checked_up_to_ = std::max(0, still_checked);
+}
+
+// For case-insensitive regexps, extends every non-standard character class in
+// this text node with the case equivalents of its ranges. Does nothing for
+// case-sensitive regexps (or, with V8_INTL_SUPPORT, when unicode case
+// equivalents are needed).
+void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
+                                   RegExpFlags flags) {
+  if (!IsIgnoreCase(flags)) return;
+#ifdef V8_INTL_SUPPORT
+  if (NeedsUnicodeCaseEquivalents(flags)) return;
+#endif
+
+  const int element_count = elements()->length();
+  for (int i = 0; i < element_count; i++) {
+    TextElement elm = elements()->at(i);
+    if (elm.text_type() != TextElement::CLASS_RANGES) continue;
+    RegExpClassRanges* cr = elm.class_ranges();
+    // None of the standard character classes is different in the case
+    // independent case and it slows us down if we don't know that.
+    if (cr->is_standard(zone())) continue;
+    ZoneList<CharacterRange>* ranges = cr->ranges(zone());
+    CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
+  }
+}
+
+// A text node contributes its full length to a greedy loop.
+int TextNode::GreedyLoopTextLength() {
+  return Length();
+}
+
+// If this forward-reading text node consists of a single character class that
+// matches every character, returns its successor; otherwise returns nullptr.
+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
+    RegExpCompiler* compiler) {
+  if (read_backward() || elements()->length() != 1) return nullptr;
+  TextElement elm = elements()->at(0);
+  if (elm.text_type() != TextElement::CLASS_RANGES) return nullptr;
+  RegExpClassRanges* class_ranges = elm.class_ranges();
+  ZoneList<CharacterRange>* ranges = class_ranges->ranges(zone());
+  CharacterRange::Canonicalize(ranges);
+
+  bool matches_everything;
+  if (class_ranges->is_negated()) {
+    // A negated empty class excludes nothing.
+    matches_everything = ranges->length() == 0;
+  } else if (ranges->length() != 1) {
+    matches_everything = false;
+  } else {
+    const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
+    matches_everything = ranges->at(0).IsEverything(max_char);
+  }
+  return matches_everything ? on_success() : nullptr;
+}
+
+// Finds the fixed match length of a sequence of nodes that goes from
+// this alternative and back to this choice node. If there are variable
+// length nodes or other complications in the way then return a sentinel
+// value indicating that a greedy loop cannot be constructed.
+int ChoiceNode::GreedyLoopTextLengthForAlternative(
+    GuardedAlternative* alternative) {
+  int length = 0;
+  // Later we will generate code for all these text nodes using recursion
+  // so we have to limit the max number.
+  int recursion_depth = 0;
+  for (RegExpNode* node = alternative->node(); node != this;
+       node = static_cast<SeqRegExpNode*>(node)->on_success()) {
+    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
+      return kNodeIsTooComplexForGreedyLoops;
+    }
+    const int node_length = node->GreedyLoopTextLength();
+    if (node_length == kNodeIsTooComplexForGreedyLoops) {
+      return kNodeIsTooComplexForGreedyLoops;
+    }
+    length += node_length;
+  }
+  if (read_backward()) {
+    length = -length;
+  }
+  // Check that we can jump by the whole text length. If not, return sentinel
+  // to indicate that we can't construct a greedy loop.
+  const bool jumpable = length >= RegExpMacroAssembler::kMinCPOffset &&
+                        length <= RegExpMacroAssembler::kMaxCPOffset;
+  return jumpable ? length : kNodeIsTooComplexForGreedyLoops;
+}
+
+// Registers |alt| as the loop body alternative. May only be done once.
+void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
+  DCHECK_NULL(loop_node_);
+  AddAlternative(alt);
+  RegExpNode* body = alt.node();
+  loop_node_ = body;
+}
+
+// Registers |alt| as the alternative that continues after the loop. May only
+// be done once.
+void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
+  DCHECK_NULL(continue_node_);
+  AddAlternative(alt);
+  RegExpNode* continuation = alt.node();
+  continue_node_ = continuation;
+}
+
+// Emits code for a loop. When this node is the trace's stop node we have
+// reached the back edge of a greedy loop and only need to advance and jump
+// back; otherwise a non-trivial trace is flushed first, and a trivial one is
+// handled like a plain ChoiceNode.
+void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+  if (trace->stop_node() != this) {
+    DCHECK_NULL(trace->stop_node());
+    if (trace->is_trivial()) {
+      ChoiceNode::Emit(compiler, trace);
+    } else {
+      trace->Flush(compiler, this);
+    }
+    return;
+  }
+  // Back edge of greedy optimized loop node graph.
+  const int text_length =
+      GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
+  DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
+  // Update the counter-based backtracking info on the stack. This is an
+  // optimization for greedy loops (see below).
+  DCHECK(trace->cp_offset() == text_length);
+  macro_assembler->AdvanceCurrentPosition(text_length);
+  macro_assembler->GoTo(trace->loop_label());
+}
+
+// Computes how many characters to preload, given that at least
+// |eats_at_least| characters will be consumed. The result never exceeds 4 and
+// is further capped by what the macro assembler can load in one go.
+int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
+                                           int eats_at_least) {
+  const int wanted = std::min(4, eats_at_least);
+  DCHECK_LE(wanted, 4);
+  if (!compiler->macro_assembler()->CanReadUnaligned()) {
+    // Without unaligned reads, load at most one character at a time.
+    return std::min(wanted, 1);
+  }
+  if (compiler->one_byte()) {
+    // We can't preload 3 characters because there is no machine instruction
+    // to do that. We can't just load 4 because we could be reading
+    // beyond the end of the string, which could cause a memory fault.
+    return wanted == 3 ? 2 : wanted;
+  }
+  // Two-byte subjects: at most two characters.
+  return std::min(wanted, 2);
+}
+
+ // Per-alternative bookkeeping used while the alternatives of a choice node
+ // are being generated. It records how the alternative is being code
+ // generated.
+ class AlternativeGeneration : public Malloced {
+  public:
+   AlternativeGeneration() {}
+
+   // Label passed to the quick check as its possible-success target.
+   Label possible_success{};
+   // Whether this alternative expects the current character(s) to be
+   // preloaded.
+   bool expects_preload = false;
+   // Label bound right after this alternative's inline code.
+   Label after{};
+   QuickCheckDetails quick_check_details{};
+ };
+
+ // Creates a list of |count| AlternativeGenerations. The first kAFew entries
+ // are stored inside the list object itself (so on the stack when the list is
+ // a local); any excess entries are allocated on the heap and released by the
+ // destructor.
+ class AlternativeGenerationList {
+  public:
+   AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
+     // Entries that fit in the inline array.
+     for (int i = 0; i < count && i < kAFew; i++) {
+       alt_gens_.Add(a_few_alt_gens_ + i, zone);
+     }
+     // Overflow entries, owned by this object.
+     for (int i = kAFew; i < count; i++) {
+       alt_gens_.Add(new AlternativeGeneration(), zone);
+     }
+   }
+   ~AlternativeGenerationList() {
+     // Only the entries past the inline array were heap-allocated.
+     for (int i = kAFew; i < alt_gens_.length(); i++) {
+       delete alt_gens_[i];
+       alt_gens_[i] = nullptr;
+     }
+   }
+
+   // Not copyable: a copy would share the heap-allocated entries (leading to
+   // a double delete) and would point into the source object's inline array.
+   AlternativeGenerationList(const AlternativeGenerationList&) = delete;
+   AlternativeGenerationList& operator=(const AlternativeGenerationList&) =
+       delete;
+
+   // Returns the i-th entry; |i| is not bounds-checked here.
+   AlternativeGeneration* at(int i) { return alt_gens_[i]; }
+
+  private:
+   static const int kAFew = 10;
+   ZoneList<AlternativeGeneration*> alt_gens_;
+   AlternativeGeneration a_few_alt_gens_[kAFew];
+ };
+
+ // Records a single character by forwarding the one-element interval
+ // [character, character] to SetInterval.
+ void BoyerMoorePositionInfo::Set(int character) {
+   const Interval single(character, character);
+   SetInterval(single);
+ }
+
+namespace {
+
+ // Combines |containment| with the relation between |new_range| and the set
+ // described by |ranges|. |ranges| is a list of boundaries that alternate
+ // between "outside" and "inside", starting outside at 0; its length must be
+ // odd and its last entry must be String::kMaxCodePoint + 1. The result is
+ // kLatticeUnknown if |new_range| straddles a boundary or if |containment|
+ // already was unknown.
+ ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
+                             int ranges_length, Interval new_range) {
+   DCHECK_EQ(1, ranges_length & 1);
+   DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
+   if (containment == kLatticeUnknown) return containment;
+
+   bool inside = false;
+   int segment_start = 0;
+   for (int i = 0; i < ranges_length; i++) {
+     const int segment_end = ranges[i];
+     // Consider the segment from segment_start to segment_end.
+     if (segment_end > new_range.from()) {
+       // This is the first segment that reaches past the start of the new
+       // range. Note that new_range.to() is inclusive, but the values in
+       // ranges are not.
+       const bool wholly_inside_segment =
+           segment_start <= new_range.from() && new_range.to() < segment_end;
+       if (!wholly_inside_segment) return kLatticeUnknown;
+       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
+     }
+     // We haven't got to the new range yet; move on to the next segment.
+     inside = !inside;
+     segment_start = segment_end;
+   }
+   return containment;
+ }
+
+ // Returns the index of the lowest set bit in |bitset|, or -1 if no bit is
+ // set.
+ int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
+   static_assert(BoyerMoorePositionInfo::kMapSize ==
+                 2 * kInt64Size * kBitsPerByte);
+   static_assert(kInt64Size >= sizeof(decltype(bitset.to_ullong())));
+
+   // The bitset is 128 bits wide, but CountTrailingZeros needs an integral
+   // type and std::bitset can only convert to unsigned long long. So the two
+   // 64-bit halves are inspected separately, least-significant half first.
+   static constexpr BoyerMoorePositionInfo::Bitset kLowHalfMask(~uint64_t{0});
+
+   const uint64_t low_half = (bitset & kLowHalfMask).to_ullong();
+   if (low_half != 0) return base::bits::CountTrailingZeros(low_half);
+
+   const uint64_t high_half = (bitset >> 64).to_ullong();
+   if (high_half != 0) return 64 + base::bits::CountTrailingZeros(high_half);
+
+   return -1;
+ }
+
+} // namespace
+
+ // Records every character of |interval| as possible at this position. The
+ // word-character lattice value is updated first; characters are then folded
+ // into the kMapSize-entry map by masking with kMask.
+ void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
+   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
+
+   // An interval at least as large as the map fills every slot.
+   if (interval.size() >= kMapSize) {
+     map_count_ = kMapSize;
+     map_.set();
+     return;
+   }
+
+   for (int c = interval.from(), last = interval.to(); c <= last; c++) {
+     const int slot = c & kMask;
+     if (!map_[slot]) {
+       map_.set(slot);
+       map_count_++;
+     }
+     // Nothing more can change once the map is full.
+     if (map_count_ == kMapSize) return;
+   }
+ }
+
+ // Marks every character as possible at this position and sets the
+ // word-character lattice value to unknown.
+ void BoyerMoorePositionInfo::SetAll() {
+   w_ = kLatticeUnknown;
+   if (map_count_ == kMapSize) return;  // Map is already full.
+   map_.set();
+   map_count_ = kMapSize;
+ }
+
+ // Builds a lookahead of |length| positions, each with a freshly
+ // zone-allocated BoyerMoorePositionInfo.
+ BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
+                                          Zone* zone)
+     : length_(length),
+       compiler_(compiler),
+       max_char_(MaxCodeUnit(compiler->one_byte())) {
+   bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
+   for (int remaining = length; remaining > 0; remaining--) {
+     BoyerMoorePositionInfo* info = zone->New<BoyerMoorePositionInfo>();
+     bitmaps_->Add(info, zone);
+   }
+ }
+
+ // Find the longest range of lookahead that has the fewest number of different
+ // characters that can occur at a given position. Since we are optimizing two
+ // different parameters at once this is a tradeoff. On success the interval is
+ // stored in |*from| and |*to| and true is returned; false means that no
+ // interval scored any points.
+ bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
+   // If more than 32 characters out of 128 can occur it is unlikely that we can
+   // be lucky enough to step forwards much of the time.
+   const int kMaxMax = 32;
+   int best_points = 0;
+   // The per-position character limit is doubled each round, starting at 4 and
+   // stopping before kMaxMax.
+   int char_limit = 4;
+   while (char_limit < kMaxMax) {
+     best_points = FindBestInterval(char_limit, best_points, from, to);
+     char_limit *= 2;
+   }
+   return best_points != 0;
+ }
+
+ // Find the highest-points range between 0 and length_ where the character
+ // information is not too vague. 'Too vague' means that there are more than
+ // max_number_of_chars that can occur at this position. Calculates the number
+ // of points as the product of width-of-the-range and
+ // probability-of-finding-one-of-the-characters, where the probability is
+ // calculated using the frequency distribution of the sample subject string.
+ // |*from| and |*to| are only written when a range beats |old_biggest_points|;
+ // the best score seen (possibly the old one) is returned.
+ int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
+                                           int old_biggest_points, int* from,
+                                           int* to) {
+   static const int kSize = RegExpMacroAssembler::kTableSize;
+   int best_points = old_biggest_points;
+   int pos = 0;
+   while (pos < length_) {
+     // Step over positions that are too vague.
+     if (Count(pos) > max_number_of_chars) {
+       pos++;
+       continue;
+     }
+
+     // Gather the maximal run of precise-enough positions starting here and
+     // the union of the characters they allow.
+     const int run_start = pos;
+     BoyerMoorePositionInfo::Bitset allowed;
+     do {
+       allowed |= bitmaps_->at(pos)->raw_bitset();
+       pos++;
+     } while (pos < length_ && Count(pos) <= max_number_of_chars);
+     const int run_length = pos - run_start;
+
+     // Iterate only over set bits. Add 1 to the frequency to give a small
+     // per-character boost for the cases where our sampling is not good enough
+     // and many characters have a frequency of zero. This means the frequency
+     // can theoretically be up to 2*kSize though we treat it mostly as a
+     // fraction of kSize.
+     int frequency = 0;
+     for (int bit = BitsetFirstSetBit(allowed); bit != -1;
+          bit = BitsetFirstSetBit(allowed)) {
+       DCHECK(allowed[bit]);  // Sanity check.
+       frequency += compiler_->frequency_collator()->Frequency(bit) + 1;
+       allowed.reset(bit);
+     }
+
+     // We use the probability of skipping times the distance we are skipping to
+     // judge the effectiveness of this. Actually we have a cut-off: By
+     // dividing by 2 we switch off the skipping if the probability of skipping
+     // is less than 50%. This is because the multibyte mask-and-compare
+     // skipping in quickcheck is more likely to do well on this case.
+     const bool in_quickcheck_range =
+         run_length < 4 ||
+         (compiler_->one_byte() ? run_start <= 4 : run_start <= 2);
+     // Called 'probability' but it is only a rough estimate and can actually
+     // be outside the 0-kSize range.
+     const int probability =
+         (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
+     const int points = run_length * probability;
+     if (points > best_points) {
+       best_points = points;
+       *from = run_start;
+       *to = pos - 1;
+     }
+   }
+   return best_points;
+ }
+
+ // Take all the characters that will not prevent a successful match if they
+ // occur in the subject string in the range between min_lookahead and
+ // max_lookahead (inclusive) measured from the current position. If the
+ // character at max_lookahead offset is not one of these characters, then we
+ // can safely skip forwards by the number of characters in the range.
+ // Fills |boolean_skip_table| accordingly and returns that skip distance.
+ int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
+                                       Handle<ByteArray> boolean_skip_table) {
+   const int kSkipArrayEntry = 0;
+   const int kDontSkipArrayEntry = 1;
+
+   // Start out with every table entry marked as skippable.
+   std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
+               boolean_skip_table->length());
+
+   for (int pos = max_lookahead; pos >= min_lookahead; pos--) {
+     BoyerMoorePositionInfo::Bitset pending = bitmaps_->at(pos)->raw_bitset();
+
+     // Iterate only over set bits, clearing each one once it is handled.
+     for (int bit = BitsetFirstSetBit(pending); bit != -1;
+          bit = BitsetFirstSetBit(pending)) {
+       DCHECK(pending[bit]);  // Sanity check.
+       boolean_skip_table->set(bit, kDontSkipArrayEntry);
+       pending.reset(bit);
+     }
+   }
+
+   return max_lookahead + 1 - min_lookahead;
+ }
+
+ // See comment above on the implementation of GetSkipTable.
+ // Emits a loop that advances the current position while the character at
+ // max_lookahead cannot be part of a match. Emits nothing if no worthwhile
+ // interval is found, or if the interval is a single character close enough
+ // to the start to be left to the mask-compare.
+ void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
+   const int kSize = RegExpMacroAssembler::kTableSize;
+
+   int min_lookahead = 0;
+   int max_lookahead = 0;
+   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
+
+   // Check if we only have a single non-empty position info, and that info
+   // contains precisely one character.
+   bool found_single_character = false;
+   int single_character = 0;
+   for (int pos = max_lookahead; pos >= min_lookahead; pos--) {
+     BoyerMoorePositionInfo* info = bitmaps_->at(pos);
+     if (info->map_count() == 0) continue;
+
+     // A second non-empty position, or more than one character here, rules
+     // out the single-character case.
+     if (found_single_character || info->map_count() > 1) {
+       found_single_character = false;
+       break;
+     }
+
+     DCHECK(!found_single_character);
+     DCHECK_EQ(info->map_count(), 1);
+
+     single_character = BitsetFirstSetBit(info->raw_bitset());
+     found_single_character = true;
+
+     DCHECK_NE(single_character, -1);
+   }
+
+   const int lookahead_width = max_lookahead + 1 - min_lookahead;
+
+   if (found_single_character) {
+     if (lookahead_width == 1 && max_lookahead < 3) {
+       // The mask-compare can probably handle this better.
+       return;
+     }
+
+     // Skip loop that compares against the one possible character.
+     Label cont, again;
+     masm->Bind(&again);
+     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+     if (max_char_ > kSize) {
+       masm->CheckCharacterAfterAnd(single_character,
+                                    RegExpMacroAssembler::kTableMask, &cont);
+     } else {
+       masm->CheckCharacter(single_character, &cont);
+     }
+     masm->AdvanceCurrentPosition(lookahead_width);
+     masm->GoTo(&again);
+     masm->Bind(&cont);
+     return;
+   }
+
+   // General case: skip loop driven by a boolean table lookup.
+   Factory* factory = masm->isolate()->factory();
+   Handle<ByteArray> boolean_skip_table =
+       factory->NewByteArray(kSize, AllocationType::kOld);
+   const int skip_distance =
+       GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
+   DCHECK_NE(0, skip_distance);
+
+   Label cont, again;
+   masm->Bind(&again);
+   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+   masm->CheckBitInTable(boolean_skip_table, &cont);
+   masm->AdvanceCurrentPosition(skip_distance);
+   masm->GoTo(&again);
+   masm->Bind(&cont);
+ }
+
+/* Code generation for choice nodes.
+ *
+ * We generate quick checks that do a mask and compare to eliminate a
+ * choice. If the quick check succeeds then it jumps to the continuation to
+ * do slow checks and check subsequent nodes. If it fails (the common case)
+ * it falls through to the next choice.
+ *
+ * Here is the desired flow graph. Nodes directly below each other imply
+ * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
+ * 3 doesn't have a quick check so we have to call the slow check.
+ * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
+ * regexp continuation is generated directly after the Sn node, up to the
+ * next GoTo if we decide to reuse some already generated code. Some
+ * nodes expect preload_characters to be preloaded into the current
+ * character register. R nodes do this preloading. Vertices are marked
+ * F for failures and S for success (possible success in the case of quick
+ * nodes). L, V, < and > are used as arrow heads.
+ *
+ *  ----------> R
+ *              |
+ *              V
+ *            Q1 -----> S1
+ *             |   S   /
+ *            F|      /
+ *             |    F/
+ *             |    /
+ *             |   R
+ *             |  /
+ *             V L
+ *            Q2 -----> S2
+ *             |   S   /
+ *            F|      /
+ *             |    F/
+ *             |    /
+ *             |   R
+ *             |  /
+ *             V L
+ *            S3
+ *             |
+ *            F|
+ *             |
+ *             R
+ *             |
+ * backtrack   V
+ *  <----------Q4
+ *   \    F    |
+ *    \        |S
+ *     \   F   V
+ *      \-----S4
+ *
+ * For greedy loops we push the current position, then generate the code that
+ * eats the input specially in EmitGreedyLoop. The other choice (the
+ * continuation) is generated by the normal code in EmitChoices, and steps back
+ * in the input to the starting position when it fails to match. The loop code
+ * looks like this (U is the unwind code that steps back in the greedy loop).
+ *
+ *              _____
+ *             /     \
+ *             V     |
+ * ----------> S1    |
+ *            /|     |
+ *           / |S    |
+ *         F/  \_____/
+ *         /
+ *        |<-----
+ *        |      \
+ *        V       |S
+ *        Q2 ---> U----->backtrack
+ *        |  F   /
+ *       S|     /
+ *        V  F /
+ *        S2--/
+ */
+
+ // Sets up the counter backtrack trace so that it backtracks to label_, and
+ // marks it as not being at the start when |not_at_start| is true.
+ GreedyLoopState::GreedyLoopState(bool not_at_start) {
+   counter_backtrack_trace_.set_backtrack(&label_);
+   if (!not_at_start) return;
+   counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
+ }
+
+ // Debug-only consistency check: for every alternative except the last one,
+ // asserts that |trace| does not mention any register used by that
+ // alternative's guards. Does nothing in non-DEBUG builds.
+ void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
+ #ifdef DEBUG
+   const int last_index = alternatives_->length() - 1;
+   for (int i = 0; i < last_index; i++) {
+     ZoneList<Guard*>* guards = alternatives_->at(i).guards();
+     if (guards == nullptr) continue;  // Unguarded alternative.
+     const int guard_count = guards->length();
+     for (int j = 0; j < guard_count; j++) {
+       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
+     }
+   }
+ #endif
+ }
+
+ // Fills in |state| for emitting this node's alternatives: computes
+ // eats_at_least_ if it has not been initialized yet, derives the number of
+ // characters to preload, and notes whether |current_trace| already has
+ // exactly that many characters preloaded.
+ void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
+                               PreloadState* state) {
+   if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
+     // Save some time by looking at most one machine word ahead.
+     const bool known_not_at_start =
+         current_trace->at_start() == Trace::FALSE_VALUE;
+     state->eats_at_least_ = EatsAtLeast(known_not_at_start);
+   }
+
+   state->preload_characters_ =
+       CalculatePreloadCharacters(compiler, state->eats_at_least_);
+
+   const bool already_preloaded =
+       current_trace->characters_preloaded() == state->preload_characters_;
+   state->preload_is_current_ = already_preloaded;
+   state->preload_has_checked_bounds_ = already_preloaded;
+ }
+ /* Emits code for this choice node. A single unguarded alternative is emitted
+  * directly. Otherwise the alternatives are generated either by EmitGreedyLoop
+  * (more than one alternative and alternative 0 has a usable greedy-loop text
+  * length) or by EmitChoices, and afterwards the out-of-line continuations are
+  * emitted for the alternatives whose quick check was inlined.
+  */
+ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+ int choice_count = alternatives_->length();
+
+ if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
+ alternatives_->at(0).node()->Emit(compiler, trace);
+ return;
+ }
+
+ AssertGuardsMentionRegisters(trace);
+
+ LimitResult limit_result = LimitVersions(compiler, trace);
+ if (limit_result == DONE) return;
+ DCHECK(limit_result == CONTINUE);
+
+ // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
+ // other choice nodes we only flush if we are out of code size budget.
+ if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
+ trace->Flush(compiler, this);
+ return;
+ }
+
+ RecursionCheck rc(compiler);
+
+ PreloadState preload;
+ preload.init();
+ GreedyLoopState greedy_loop_state(not_at_start());
+
+ int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
+ AlternativeGenerationList alt_gens(choice_count, zone());
+
+ if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
+ trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
+ &greedy_loop_state, text_length);  // |trace| now is the returned trace.
+ } else {
+ // TODO(erikcorry): Delete this. We don't need this label, but it makes us
+ // match the traces produced pre-cleanup.
+ Label second_choice;
+ compiler->macro_assembler()->Bind(&second_choice);
+
+ preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
+
+ EmitChoices(compiler, &alt_gens, 0, trace, &preload);
+ }
+
+ // At this point we need to generate slow checks for the alternatives where
+ // the quick check was inlined. We can recognize these because the associated
+ // label was bound.
+ int new_flush_budget = trace->flush_budget() / choice_count;  // Equal share per alternative.
+ for (int i = 0; i < choice_count; i++) {
+ AlternativeGeneration* alt_gen = alt_gens.at(i);
+ Trace new_trace(*trace);
+ // If there are actions to be flushed we have to limit how many times
+ // they are flushed. Take the budget of the parent trace and distribute
+ // it fairly amongst the children.
+ if (new_trace.actions() != nullptr) {
+ new_trace.set_flush_budget(new_flush_budget);
+ }
+ bool next_expects_preload =
+ i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;  // The last alternative has no successor.
+ EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
+ alt_gen, preload.preload_characters_,
+ next_expects_preload);
+ }
+ }
+ /* Emits the greedy-loop form of this choice node. The current position is
+  * pushed, alternative 0 is emitted as a loop whose trace has this node as
+  * its stop node, and the remaining alternatives (from index 1) are emitted
+  * with EmitChoices using the counter backtrack trace of |greedy_loop_state|.
+  * Finally the unwind code is emitted: it backtracks via |trace| when
+  * CheckGreedyLoop detects that we have unwound to the bottom, and otherwise
+  * steps back by |text_length| and retries the second choice. Returns the
+  * counter backtrack trace.
+  */
+ Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
+ AlternativeGenerationList* alt_gens,
+ PreloadState* preload,
+ GreedyLoopState* greedy_loop_state,
+ int text_length) {
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+ // Here we have special handling for greedy loops containing only text nodes
+ // and other simple nodes. These are handled by pushing the current
+ // position on the stack and then incrementing the current position each
+ // time around the switch. On backtrack we decrement the current position
+ // and check it against the pushed value. This avoids pushing backtrack
+ // information for each iteration of the loop, which could take up a lot of
+ // space.
+ DCHECK(trace->stop_node() == nullptr);
+ macro_assembler->PushCurrentPosition();
+ Label greedy_match_failed;
+ Trace greedy_match_trace;
+ if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
+ greedy_match_trace.set_backtrack(&greedy_match_failed);
+ Label loop_label;
+ macro_assembler->Bind(&loop_label);
+ greedy_match_trace.set_stop_node(this);  // LoopChoiceNode::Emit checks for this stop node.
+ greedy_match_trace.set_loop_label(&loop_label);
+ alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
+ macro_assembler->Bind(&greedy_match_failed);
+
+ Label second_choice; // For use in greedy matches.
+ macro_assembler->Bind(&second_choice);
+
+ Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
+
+ EmitChoices(compiler, alt_gens, 1, new_trace, preload);  // Alternative 0 was emitted above.
+
+ macro_assembler->Bind(greedy_loop_state->label());
+ // If we have unwound to the bottom then backtrack.
+ macro_assembler->CheckGreedyLoop(trace->backtrack());
+ // Otherwise try the second priority at an earlier position.
+ macro_assembler->AdvanceCurrentPosition(-text_length);
+ macro_assembler->GoTo(&second_choice);
+ return new_trace;
+ }
+
+ // If this node is a two-alternative loop whose second alternative is an
+ // unguarded "eat any character" text node leading back to this node, emits
+ // Boyer-Moore style skip instructions for the first alternative. Returns the
+ // eats-at-least value computed along the way, or
+ // PreloadState::kEatsAtLeastNotYetInitialized if none was computed.
+ int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
+                                               Trace* trace) {
+   const int kNotComputed = PreloadState::kEatsAtLeastNotYetInitialized;
+   if (alternatives_->length() != 2) return kNotComputed;
+
+   GuardedAlternative skipper = alternatives_->at(1);
+   ZoneList<Guard*>* skipper_guards = skipper.guards();
+   if (skipper_guards != nullptr && skipper_guards->length() != 0) {
+     return kNotComputed;
+   }
+   if (skipper.node()->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
+     return kNotComputed;
+   }
+
+   // Really we should be creating a new trace when we execute this function,
+   // but there is no need, because the code it generates cannot backtrack, and
+   // we always arrive here with a trivial trace (since it's the entry to a
+   // loop). That also implies that there are no preloaded characters, which is
+   // good, because it means we won't be violating any assumptions by
+   // overwriting those characters with new load instructions.
+   DCHECK(trace->is_trivial());
+
+   RegExpMacroAssembler* const masm = compiler->macro_assembler();
+   Isolate* const isolate = masm->isolate();
+   // At this point we know that we are at a non-greedy loop that will eat
+   // any character one at a time. Any non-anchored regexp has such a
+   // loop prepended to it in order to find where it starts. We look for
+   // a pattern of the form ...abc... where we can look 6 characters ahead
+   // and step forwards 3 if the character is not one of abc. Abc need
+   // not be atoms, they can be any reasonably limited character class or
+   // small alternation.
+   int eats_at_least = kNotComputed;
+   BoyerMooreLookahead* lookahead = bm_info(false);
+   if (lookahead == nullptr) {
+     // No cached lookahead info: build it from the first alternative, if it
+     // is known to consume at least one character.
+     eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
+     if (eats_at_least >= 1) {
+       lookahead =
+           zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
+       GuardedAlternative first = alternatives_->at(0);
+       first.node()->FillInBMInfo(isolate, 0, kRecursionBudget, lookahead,
+                                  false);
+     }
+   }
+   if (lookahead != nullptr) {
+     lookahead->EmitSkipInstructions(masm);
+   }
+   return eats_at_least;
+ }
+ /* Emits the alternatives with index first_choice and up, one after the
+  * other. For each alternative a quick check is attempted when
+  * compiler->optimize() and try_to_emit_quick_check_for_alternative() allow
+  * it. If the quick check falls through on failure, the full check is left
+  * for EmitOutOfLineContinuation; otherwise the full check is emitted inline.
+  * Alternatives whose quick check details say they cannot match are skipped.
+  * |preload| is updated along the way to track whether the preloaded
+  * characters are still current and bounds-checked.
+  */
+ void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
+ AlternativeGenerationList* alt_gens,
+ int first_choice, Trace* trace,
+ PreloadState* preload) {
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+ SetUpPreLoad(compiler, trace, preload);
+
+ // For now we just call all choices one after the other. The idea ultimately
+ // is to use the Dispatch table to try only the relevant ones.
+ int choice_count = alternatives_->length();
+
+ int new_flush_budget = trace->flush_budget() / choice_count;
+
+ for (int i = first_choice; i < choice_count; i++) {
+ bool is_last = i == choice_count - 1;
+ bool fall_through_on_failure = !is_last;
+ GuardedAlternative alternative = alternatives_->at(i);
+ AlternativeGeneration* alt_gen = alt_gens->at(i);
+ alt_gen->quick_check_details.set_characters(preload->preload_characters_);
+ ZoneList<Guard*>* guards = alternative.guards();
+ int guard_count = (guards == nullptr) ? 0 : guards->length();
+ Trace new_trace(*trace);
+ new_trace.set_characters_preloaded(
+ preload->preload_is_current_ ? preload->preload_characters_ : 0);
+ if (preload->preload_has_checked_bounds_) {
+ new_trace.set_bound_checked_up_to(preload->preload_characters_);
+ }
+ new_trace.quick_check_performed()->Clear();
+ if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
+ if (!is_last) {
+ new_trace.set_backtrack(&alt_gen->after);  // Failure falls through to the next alternative.
+ }
+ alt_gen->expects_preload = preload->preload_is_current_;
+ bool generate_full_check_inline = false;
+ if (compiler->optimize() &&
+ try_to_emit_quick_check_for_alternative(i == 0) &&
+ alternative.node()->EmitQuickCheck(
+ compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
+ &alt_gen->possible_success, &alt_gen->quick_check_details,
+ fall_through_on_failure, this)) {
+ // Quick check was generated for this choice.
+ preload->preload_is_current_ = true;
+ preload->preload_has_checked_bounds_ = true;
+ // If we generated the quick check to fall through on possible success,
+ // we now need to generate the full check inline.
+ if (!fall_through_on_failure) {
+ macro_assembler->Bind(&alt_gen->possible_success);
+ new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
+ new_trace.set_characters_preloaded(preload->preload_characters_);
+ new_trace.set_bound_checked_up_to(preload->preload_characters_);
+ generate_full_check_inline = true;
+ }
+ } else if (alt_gen->quick_check_details.cannot_match()) {
+ if (!fall_through_on_failure) {
+ macro_assembler->GoTo(trace->backtrack());
+ }
+ continue;  // Note: alt_gen->after is not bound on this path.
+ } else {
+ // No quick check was generated. Put the full code here.
+ // If this is not the first choice then there could be slow checks from
+ // previous cases that go here when they fail. There's no reason to
+ // insist that they preload characters since the slow check we are about
+ // to generate probably can't use it.
+ if (i != first_choice) {
+ alt_gen->expects_preload = false;
+ new_trace.InvalidateCurrentCharacter();
+ }
+ generate_full_check_inline = true;
+ }
+ if (generate_full_check_inline) {
+ if (new_trace.actions() != nullptr) {
+ new_trace.set_flush_budget(new_flush_budget);
+ }
+ for (int j = 0; j < guard_count; j++) {
+ GenerateGuard(macro_assembler, guards->at(j), &new_trace);
+ }
+ alternative.node()->Emit(compiler, &new_trace);
+ preload->preload_is_current_ = false;
+ }
+ macro_assembler->Bind(&alt_gen->after);
+ }
+ }
+ /* Emits the full (slow) check for an alternative whose quick check jumps to
+  * alt_gen->possible_success. Does nothing if that label is not linked. The
+  * guards and the alternative's node are emitted with a trace that records
+  * the performed quick check and |preload_characters| preloaded characters.
+  * On failure control goes to alt_gen->after; if |next_expects_preload| is
+  * true the current character is reloaded first.
+  */
+ void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
+ Trace* trace,
+ GuardedAlternative alternative,
+ AlternativeGeneration* alt_gen,
+ int preload_characters,
+ bool next_expects_preload) {
+ if (!alt_gen->possible_success.is_linked()) return;
+
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+ macro_assembler->Bind(&alt_gen->possible_success);
+ Trace out_of_line_trace(*trace);
+ out_of_line_trace.set_characters_preloaded(preload_characters);
+ out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
+ if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
+ ZoneList<Guard*>* guards = alternative.guards();
+ int guard_count = (guards == nullptr) ? 0 : guards->length();
+ if (next_expects_preload) {
+ Label reload_current_char;
+ out_of_line_trace.set_backtrack(&reload_current_char);  // Fail via the reload code below.
+ for (int j = 0; j < guard_count; j++) {
+ GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
+ }
+ alternative.node()->Emit(compiler, &out_of_line_trace);
+ macro_assembler->Bind(&reload_current_char);
+ // Reload the current character, since the next quick check expects that.
+ // We don't need to check bounds here because we only get into this
+ // code through a quick check which already did the checked load.
+ macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
+ preload_characters);
+ macro_assembler->GoTo(&(alt_gen->after));
+ } else {
+ out_of_line_trace.set_backtrack(&(alt_gen->after));  // No reload needed: fail straight to |after|.
+ for (int j = 0; j < guard_count; j++) {
+ GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
+ }
+ alternative.node()->Emit(compiler, &out_of_line_trace);
+ }
+ }
+ /* Emits code for this action node, dispatching on action_type_.
+  * STORE_POSITION, INCREMENT_REGISTER, SET_REGISTER_FOR_LOOP and
+  * CLEAR_CAPTURES emit nothing themselves: they add a deferred action to a
+  * copy of |trace| and continue with the successor. The submatch and
+  * empty-match-check actions emit code directly; where they need the current
+  * state materialized, a non-trivial trace is flushed instead.
+  */
+ void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+ RegExpMacroAssembler* assembler = compiler->macro_assembler();
+ LimitResult limit_result = LimitVersions(compiler, trace);
+ if (limit_result == DONE) return;
+ DCHECK(limit_result == CONTINUE);
+
+ RecursionCheck rc(compiler);
+
+ switch (action_type_) {
+ case STORE_POSITION: {
+ Trace::DeferredCapture new_capture(data_.u_position_register.reg,
+ data_.u_position_register.is_capture,
+ trace);
+ Trace new_trace = *trace;
+ new_trace.add_action(&new_capture);  // The action object lives on this stack frame during the Emit call below.
+ on_success()->Emit(compiler, &new_trace);
+ break;
+ }
+ case INCREMENT_REGISTER: {
+ Trace::DeferredIncrementRegister new_increment(
+ data_.u_increment_register.reg);
+ Trace new_trace = *trace;
+ new_trace.add_action(&new_increment);
+ on_success()->Emit(compiler, &new_trace);
+ break;
+ }
+ case SET_REGISTER_FOR_LOOP: {
+ Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
+ data_.u_store_register.value);
+ Trace new_trace = *trace;
+ new_trace.add_action(&new_set);
+ on_success()->Emit(compiler, &new_trace);
+ break;
+ }
+ case CLEAR_CAPTURES: {
+ Trace::DeferredClearCaptures new_capture(Interval(
+ data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
+ Trace new_trace = *trace;
+ new_trace.add_action(&new_capture);
+ on_success()->Emit(compiler, &new_trace);
+ break;
+ }
+ case BEGIN_POSITIVE_SUBMATCH:
+ case BEGIN_NEGATIVE_SUBMATCH:
+ if (!trace->is_trivial()) {
+ trace->Flush(compiler, this);
+ } else {
+ assembler->WriteCurrentPositionToRegister(
+ data_.u_submatch.current_position_register, 0);
+ assembler->WriteStackPointerToRegister(
+ data_.u_submatch.stack_pointer_register);
+ on_success()->Emit(compiler, trace);
+ }
+ break;
+ case EMPTY_MATCH_CHECK: {
+ int start_pos_reg = data_.u_empty_match_check.start_register;
+ int stored_pos = 0;
+ int rep_reg = data_.u_empty_match_check.repetition_register;
+ bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
+ bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
+ if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
+ // If we know we haven't advanced and there is no minimum we
+ // can just backtrack immediately.
+ assembler->GoTo(trace->backtrack());
+ } else if (know_dist && stored_pos < trace->cp_offset()) {
+ // If we know we've advanced we can generate the continuation
+ // immediately.
+ on_success()->Emit(compiler, trace);
+ } else if (!trace->is_trivial()) {
+ trace->Flush(compiler, this);
+ } else {
+ Label skip_empty_check;
+ // If we have a minimum number of repetitions we check the current
+ // number first and skip the empty check if it's not enough.
+ if (has_minimum) {
+ int limit = data_.u_empty_match_check.repetition_limit;
+ assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
+ }
+ // If the match is empty we bail out, otherwise we fall through
+ // to the on-success continuation.
+ assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
+ trace->backtrack());
+ assembler->Bind(&skip_empty_check);
+ on_success()->Emit(compiler, trace);
+ }
+ break;
+ }
+ case POSITIVE_SUBMATCH_SUCCESS: {
+ if (!trace->is_trivial()) {
+ trace->Flush(compiler, this);
+ return;
+ }
+ assembler->ReadCurrentPositionFromRegister(
+ data_.u_submatch.current_position_register);
+ assembler->ReadStackPointerFromRegister(
+ data_.u_submatch.stack_pointer_register);
+ int clear_register_count = data_.u_submatch.clear_register_count;
+ if (clear_register_count == 0) {
+ on_success()->Emit(compiler, trace);
+ return;
+ }
+ int clear_registers_from = data_.u_submatch.clear_register_from;
+ Label clear_registers_backtrack;
+ Trace new_trace = *trace;
+ new_trace.set_backtrack(&clear_registers_backtrack);  // Failure of the continuation lands on the clearing code below.
+ on_success()->Emit(compiler, &new_trace);
+
+ assembler->Bind(&clear_registers_backtrack);
+ int clear_registers_to = clear_registers_from + clear_register_count - 1;  // Inclusive upper bound.
+ assembler->ClearRegisters(clear_registers_from, clear_registers_to);
+
+ DCHECK(trace->backtrack() == nullptr);
+ assembler->Backtrack();
+ return;
+ }
+ default:
+ UNREACHABLE();
+ }
+ }
+
+ // Emits code that checks the subject against the capture starting at
+ // start_reg_, backtracking on mismatch, and then continues with the
+ // successor. A non-trivial trace is flushed first.
+ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+   if (!trace->is_trivial()) {
+     trace->Flush(compiler, this);
+     return;
+   }
+
+   const LimitResult limit_result = LimitVersions(compiler, trace);
+   if (limit_result == DONE) return;
+   DCHECK(limit_result == CONTINUE);
+
+   RecursionCheck rc(compiler);
+
+   DCHECK_EQ(start_reg_ + 1, end_reg_);
+   RegExpMacroAssembler* const masm = compiler->macro_assembler();
+   const bool unicode = IsEitherUnicode(flags_);
+   if (!IsIgnoreCase(flags_)) {
+     masm->CheckNotBackReference(start_reg_, read_backward(),
+                                 trace->backtrack());
+   } else {
+     masm->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
+                                           unicode, trace->backtrack());
+   }
+   // We are going to advance backward, so we may end up at the start.
+   if (read_backward()) {
+     trace->set_at_start(Trace::UNKNOWN);
+   }
+
+   // Check that the back reference does not end inside a surrogate pair.
+   const bool needs_surrogate_check = unicode && !compiler->one_byte();
+   if (needs_surrogate_check) {
+     masm->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
+   }
+   on_success()->Emit(compiler, trace);
+ }
+
+ // Assigns each text element its offset relative to the start of the node.
+ // This is a fixed quantity since a TextNode can only contain fixed-width
+ // things.
+ void TextNode::CalculateOffsets() {
+   int running_offset = 0;
+   const int count = elements()->length();
+   for (int index = 0; index < count; index++) {
+     TextElement& element = elements()->at(index);
+     element.set_cp_offset(running_offset);
+     running_offset += element.length();
+   }
+ }
+
+namespace {
+
+ // Assertion propagation moves information about assertions such as
+ // \b to the affected nodes. For instance, in /.\b./ information must
+ // be propagated to the first '.' that whatever follows needs to know
+ // if it matched a word or a non-word, and to the second '.' that it
+ // has to check if it succeeds a word or non-word. In this case the
+ // result will be something like:
+ //
+ //   +-------+        +------------+
+ //   |   .   |        |      .     |
+ //   +-------+  --->  +------------+
+ //   | word? |        | check word |
+ //   +-------+        +------------+
+ class AssertionPropagator : public AllStatic {
+  public:
+   // Text nodes need no propagation here.
+   static void VisitText(TextNode* that) {}
+
+   static void VisitAction(ActionNode* that) {
+     // If the next node is interested in what it follows then this node
+     // has to be interested too so it can pass the information on.
+     RegExpNode* successor = that->on_success();
+     that->info()->AddFromFollowing(successor->info());
+   }
+
+   static void VisitChoice(ChoiceNode* that, int i) {
+     // Anything the following nodes need to know has to be known by
+     // this node also, so it can pass it on.
+     RegExpNode* alternative = that->alternatives()->at(i).node();
+     that->info()->AddFromFollowing(alternative->info());
+   }
+
+   // Pulls in what the continuation after the loop needs to know.
+   static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
+     RegExpNode* continuation = that->continue_node();
+     that->info()->AddFromFollowing(continuation->info());
+   }
+
+   // Pulls in what the loop body needs to know.
+   static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
+     RegExpNode* body = that->loop_node();
+     that->info()->AddFromFollowing(body->info());
+   }
+
+   // The two alternatives of a negative lookaround are handled like ordinary
+   // choice alternatives at their fixed indices.
+   static void VisitNegativeLookaroundChoiceLookaroundNode(
+       NegativeLookaroundChoiceNode* that) {
+     VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
+   }
+
+   static void VisitNegativeLookaroundChoiceContinueNode(
+       NegativeLookaroundChoiceNode* that) {
+     VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
+   }
+
+   // Back references and assertions need no propagation here.
+   static void VisitBackReference(BackReferenceNode* that) {}
+
+   static void VisitAssertion(AssertionNode* that) {}
+ };
+
+ // Pushes the minimum length of a successful match backwards through the
+ // graph, from successor nodes to their predecessors. Every eats_at_least
+ // value starts out as zero before the analysis runs.
+ class EatsAtLeastPropagator : public AllStatic {
+  public:
+   static void VisitText(TextNode* that) {
+     // Nodes that read backward never use their eats_at_least value.
+     if (that->read_backward()) return;
+     // Once this node has matched we are no longer at the start, which is why
+     // the successor's eats_at_least_from_not_start value is the right one.
+     const auto successor_minimum =
+         that->on_success()->eats_at_least_info()->eats_at_least_from_not_start;
+     const uint8_t total =
+         base::saturated_cast<uint8_t>(that->Length() + successor_minimum);
+     that->set_eats_at_least_info(EatsAtLeastInfo(total));
+   }
+
+   static void VisitAction(ActionNode* that) {
+     switch (that->action_type()) {
+       case ActionNode::BEGIN_POSITIVE_SUBMATCH:
+       case ActionNode::POSITIVE_SUBMATCH_SUCCESS: {
+         // Positive lookarounds rewind the input, so eats_at_least data is not
+         // propagated through them.
+         // TODO(v8:11859) Potential approaches for fixing this include:
+         // 1. Add a dedicated choice node for positive lookaround, similar to
+         //    NegativeLookaroundChoiceNode.
+         // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which
+         //    is <= eats_at_least_from_possibly_start, and use that value in
+         //    EatsAtLeastFromLoopEntry.
+         DCHECK(that->eats_at_least_info()->IsZero());
+         break;
+       }
+       case ActionNode::SET_REGISTER_FOR_LOOP: {
+         // This action marks a loop entry point: the loop body runs at least
+         // its minimum number of times before the continuation can run.
+         that->set_eats_at_least_info(
+             that->on_success()->EatsAtLeastFromLoopEntry());
+         break;
+       }
+       case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
+       default: {
+         // In every other case this node eats at least as much as its
+         // successor. BEGIN_NEGATIVE_SUBMATCH may propagate too, because
+         // NegativeLookaroundChoiceNode ignores its lookaround successor when
+         // computing eats-at-least and quick check information.
+         that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
+         break;
+       }
+     }
+   }
+
+   static void VisitChoice(ChoiceNode* that, int i) {
+     // A choice node can match no less than the smallest of its alternatives.
+     // The first alternative seeds the running minimum with the maximum value.
+     const bool is_first_alternative = (i == 0);
+     EatsAtLeastInfo running_minimum = is_first_alternative
+                                           ? EatsAtLeastInfo(UINT8_MAX)
+                                           : *that->eats_at_least_info();
+     RegExpNode* alternative_node = that->alternatives()->at(i).node();
+     running_minimum.SetMin(*alternative_node->eats_at_least_info());
+     that->set_eats_at_least_info(running_minimum);
+   }
+
+   static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
+     if (that->read_backward()) return;
+     that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
+   }
+
+   // The loop branch contributes nothing here.
+   static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
+
+   // The lookaround branch contributes nothing here.
+   static void VisitNegativeLookaroundChoiceLookaroundNode(
+       NegativeLookaroundChoiceNode* that) {}
+
+   static void VisitNegativeLookaroundChoiceContinueNode(
+       NegativeLookaroundChoiceNode* that) {
+     RegExpNode* continuation = that->continue_node();
+     that->set_eats_at_least_info(*continuation->eats_at_least_info());
+   }
+
+   static void VisitBackReference(BackReferenceNode* that) {
+     if (that->read_backward()) return;
+     that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
+   }
+
+   static void VisitAssertion(AssertionNode* that) {
+     EatsAtLeastInfo info = *that->on_success()->eats_at_least_info();
+     const bool asserts_at_start =
+         that->assertion_type() == AssertionNode::AT_START;
+     if (asserts_at_start) {
+       // When we already know we are not at the start, the question "how many
+       // characters will you match if you succeed?" can be answered with
+       // anything, because false implies false. Answering with the maximum
+       // (UINT8_MAX) does not stop other branches of the node graph from
+       // preloading a lot of characters.
+       info.eats_at_least_from_not_start = UINT8_MAX;
+     }
+     that->set_eats_at_least_info(info);
+   }
+ };
+
+} // namespace
+
+// -------------------------------------------------------------------
+// Analysis
+
+ // Iterates the node graph depth-first. Each Visit* method analyzes the node's successors before it
+ // runs the matching static hook of every propagator, so hooks can set values that depend on successor nodes.
+ template <typename... Propagators>
+ class Analysis : public NodeVisitor {
+ public:
+ Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
+ : isolate_(isolate),
+ is_one_byte_(is_one_byte),
+ flags_(flags),
+ error_(RegExpError::kNone) {}
+
+ void EnsureAnalyzed(RegExpNode* that) {  // Visits |that| at most once; on stack overflow records an error and returns.
+ StackLimitCheck check(isolate());
+ if (check.HasOverflowed()) {
+ if (v8_flags.correctness_fuzzer_suppressions) {
+ FATAL("Analysis: Aborting on stack overflow");
+ }
+ fail(RegExpError::kAnalysisStackOverflow);
+ return;
+ }
+ if (that->info()->been_analyzed || that->info()->being_analyzed) return;  // Already done, or in progress further up the recursion.
+ that->info()->being_analyzed = true;
+ that->Accept(this);
+ that->info()->being_analyzed = false;
+ that->info()->been_analyzed = true;
+ }
+
+ bool has_failed() { return error_ != RegExpError::kNone; }
+ RegExpError error() {  // Only meaningful after a failure; DCHECKs that an error was recorded.
+ DCHECK(error_ != RegExpError::kNone);
+ return error_;
+ }
+ void fail(RegExpError error) { error_ = error; }
+
+ Isolate* isolate() const { return isolate_; }
+
+ void VisitEnd(EndNode* that) override {
+ // Nothing to do for end nodes.
+ }
+
+ // Used to call the given static function on each propagator / variadic template
+ // argument. |expr| must mention the Propagators pack; it is expanded inside a dummy array initializer.
+ #define STATIC_FOR_EACH(expr) \
+ do { \
+ int dummy[] = {((expr), 0)...}; \
+ USE(dummy); \
+ } while (false)
+
+ void VisitText(TextNode* that) override {
+ that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
+ EnsureAnalyzed(that->on_success());
+ if (has_failed()) return;
+ that->CalculateOffsets();
+ STATIC_FOR_EACH(Propagators::VisitText(that));
+ }
+
+ void VisitAction(ActionNode* that) override {
+ EnsureAnalyzed(that->on_success());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(Propagators::VisitAction(that));
+ }
+
+ void VisitChoice(ChoiceNode* that) override {
+ for (int i = 0; i < that->alternatives()->length(); i++) {
+ EnsureAnalyzed(that->alternatives()->at(i).node());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(Propagators::VisitChoice(that, i));  // Hooks run once per alternative, right after it was analyzed.
+ }
+ }
+
+ void VisitLoopChoice(LoopChoiceNode* that) override {
+ DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue.
+
+ // First propagate all information from the continuation node.
+ EnsureAnalyzed(that->continue_node());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
+
+ // Check the loop last since it may need the value of this node
+ // to get a correct result.
+ EnsureAnalyzed(that->loop_node());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
+ }
+
+ void VisitNegativeLookaroundChoice(
+ NegativeLookaroundChoiceNode* that) override {
+ DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue.
+
+ EnsureAnalyzed(that->lookaround_node());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(
+ Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
+
+ EnsureAnalyzed(that->continue_node());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(
+ Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
+ }
+
+ void VisitBackReference(BackReferenceNode* that) override {
+ EnsureAnalyzed(that->on_success());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(Propagators::VisitBackReference(that));
+ }
+
+ void VisitAssertion(AssertionNode* that) override {
+ EnsureAnalyzed(that->on_success());
+ if (has_failed()) return;
+ STATIC_FOR_EACH(Propagators::VisitAssertion(that));
+ }
+
+ #undef STATIC_FOR_EACH
+
+ private:
+ Isolate* isolate_;
+ const bool is_one_byte_;
+ const RegExpFlags flags_;
+ RegExpError error_;  // RegExpError::kNone until fail() is called.
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
+ };
+
+ // Runs assertion propagation and eats-at-least propagation over the graph
+ // rooted at |node|. Returns RegExpError::kNone when the analysis succeeded
+ // and the recorded error otherwise.
+ RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
+                           RegExpNode* node) {
+   using FullAnalysis = Analysis<AssertionPropagator, EatsAtLeastPropagator>;
+   FullAnalysis analysis(isolate, is_one_byte, flags);
+   // The root must not have been analyzed before.
+   DCHECK_EQ(node->info()->been_analyzed, false);
+   analysis.EnsureAnalyzed(node);
+   if (!analysis.has_failed()) return RegExpError::kNone;
+   // A failed analysis always carries a concrete error code.
+   DCHECK(analysis.error() != RegExpError::kNone);
+   return analysis.error();
+ }
+
+ void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start) {  // |isolate| and |budget| are not used in this body.
+ // Working out the set of characters that a backreference can match is too hard, so every lookahead
+ // position from |offset| to the end is marked as matching any character before the info is saved.
+ bm->SetRest(offset);
+ SaveBMInfo(bm, not_at_start, offset);
+ }
+
+ static_assert(BoyerMoorePositionInfo::kMapSize ==
+ RegExpMacroAssembler::kTableSize);  // Compile-time guard: the Boyer-Moore position map and the assembler table must stay the same size.
+
+ // Collects Boyer-Moore lookahead information from every alternative, sharing
+ // the remaining budget evenly between them. As soon as a guarded alternative
+ // is found, every position from |offset| onward is marked as matching
+ // anything and the remaining alternatives are not visited.
+ void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+                               BoyerMooreLookahead* bm, bool not_at_start) {
+   ZoneList<GuardedAlternative>* choices = alternatives();
+   const int budget_per_choice = (budget - 1) / choices->length();
+   for (int index = 0; index < choices->length(); ++index) {
+     GuardedAlternative& choice = choices->at(index);
+     const bool is_guarded =
+         choice.guards() != nullptr && choice.guards()->length() != 0;
+     if (is_guarded) {
+       bm->SetRest(offset);  // Give up trying to fill in info.
+       break;
+     }
+     choice.node()->FillInBMInfo(isolate, offset, budget_per_choice, bm,
+                                 not_at_start);
+   }
+   // The info is saved both after giving up and after a full pass.
+   SaveBMInfo(bm, not_at_start, offset);
+ }
+
+ // Records, for each lookahead position covered by this text node, which
+ // characters may occur there, and then lets the successor continue at the
+ // first offset past this node. Positions at or beyond bm->length() are never
+ // touched. When the lookahead started at this node (initial_offset == 0) the
+ // table is stored on the node through set_bm_info() before returning.
+ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
+                             BoyerMooreLookahead* bm, bool not_at_start) {
+   if (initial_offset >= bm->length()) return;
+   const bool starts_lookahead = (initial_offset == 0);
+   const int max_char = bm->max_char();
+   int offset = initial_offset;
+   for (int element_index = 0; element_index < elements()->length();
+        ++element_index) {
+     if (offset >= bm->length()) {
+       if (starts_lookahead) set_bm_info(not_at_start, bm);
+       return;
+     }
+     TextElement element = elements()->at(element_index);
+     if (element.text_type() != TextElement::ATOM) {
+       // A character class occupies exactly one position.
+       DCHECK_EQ(TextElement::CLASS_RANGES, element.text_type());
+       RegExpClassRanges* char_class = element.class_ranges();
+       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
+       if (char_class->is_negated()) {
+         bm->SetAll(offset);
+       } else {
+         for (int r = 0; r < ranges->length(); ++r) {
+           CharacterRange& range = ranges->at(r);
+           if (static_cast<int>(range.from()) > max_char) continue;
+           const int last = std::min(max_char, static_cast<int>(range.to()));
+           bm->SetInterval(offset, Interval(range.from(), last));
+         }
+       }
+       ++offset;
+       continue;
+     }
+     // An atom occupies one position per character.
+     RegExpAtom* atom = element.atom();
+     for (int j = 0; j < atom->length(); ++j, ++offset) {
+       if (offset >= bm->length()) {
+         if (starts_lookahead) set_bm_info(not_at_start, bm);
+         return;
+       }
+       base::uc16 character = atom->data()[j];
+       if (!IsIgnoreCase(bm->compiler()->flags())) {
+         if (character <= max_char) bm->Set(offset, character);
+         continue;
+       }
+       // Case-insensitive: record every case variant of the character.
+       unibrow::uchar variants[4];
+       const int variant_count = GetCaseIndependentLetters(
+           isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
+           variants, 4);
+       for (int k = 0; k < variant_count; ++k) {
+         bm->Set(offset, variants[k]);
+       }
+     }
+   }
+   if (offset < bm->length()) {
+     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
+                                true);  // Not at start after a text node.
+   }
+   if (starts_lookahead) set_bm_info(not_at_start, bm);
+ }
+
+ // Puts a two-way choice in front of |on_success|. The first alternative is a
+ // lookaround (built with RegExpLookaround::Builder) around a trail-surrogate
+ // text node; when it matches, a lead-surrogate text node runs next and then
+ // |on_success|. The second alternative goes to |on_success| directly.
+ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
+     RegExpNode* on_success) {
+   DCHECK(!read_backward());
+   ZoneList<CharacterRange>* lead_set = CharacterRange::List(
+       zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
+   ZoneList<CharacterRange>* trail_set = CharacterRange::List(
+       zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
+
+   ChoiceNode* result = zone()->New<ChoiceNode>(2, zone());
+
+   // Both registers are fetched before any node is built, stack register first.
+   int stack_reg = UnicodeLookaroundStackRegister();
+   int position_reg = UnicodeLookaroundPositionRegister();
+
+   RegExpNode* lead_node =
+       TextNode::CreateForCharacterRanges(zone(), lead_set, true, on_success);
+   RegExpLookaround::Builder lookaround(true, lead_node, stack_reg,
+                                        position_reg);
+   RegExpNode* trail_node = TextNode::CreateForCharacterRanges(
+       zone(), trail_set, false, lookaround.on_match_success());
+
+   GuardedAlternative step_back_alternative(lookaround.ForMatch(trail_node));
+   result->AddAlternative(step_back_alternative);
+   GuardedAlternative direct_alternative(on_success);
+   result->AddAlternative(direct_alternative);
+
+   return result;
+ }
+
+ // Final step of node creation: wraps the pattern in capture #0, prepends the
+ // implicit non-greedy "match anything" loop when the pattern is neither
+ // anchored at the start nor sticky, filters the graph for one-byte subjects,
+ // and, for two-byte subjects with a unicode flag plus global or sticky, adds
+ // the optional step back to a lead surrogate. Never returns nullptr.
+ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
+                                              RegExpFlags flags,
+                                              bool is_one_byte) {
+   // The body of the regexp always lives inside capture #0.
+   RegExpNode* const body = RegExpCapture::ToNode(data->tree, 0, this, accept());
+   RegExpNode* root = body;
+
+   const bool needs_implicit_prefix =
+       !data->tree->IsAnchoredAtStart() && !IsSticky(flags);
+   if (needs_implicit_prefix) {
+     // Put a .*? in front of the body, outside the body capture.
+     RegExpNode* const skip_loop = RegExpQuantifier::ToNode(
+         0, RegExpTree::kInfinity, false,
+         zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything), this,
+         body, data->contains_anchor);
+     root = skip_loop;
+
+     if (data->contains_anchor) {
+       // Unroll the loop once to handle a match that might begin at the start
+       // of the input.
+       ChoiceNode* unrolled = zone()->New<ChoiceNode>(2, zone());
+       unrolled->AddAlternative(GuardedAlternative(body));
+       RegExpClassRanges* any_character =
+           zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything);
+       TextNode* skip_one =
+           zone()->New<TextNode>(any_character, false, skip_loop);
+       unrolled->AddAlternative(GuardedAlternative(skip_one));
+       root = unrolled;
+     }
+   }
+
+   if (is_one_byte) {
+     root = root->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
+     // A second pass propagates the new nodes to places where they could not
+     // be put the first time because they had not been calculated yet.
+     if (root != nullptr) {
+       root = root->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
+     }
+   } else if (IsEitherUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
+     root = OptionallyStepBackToLeadSurrogate(root);
+   }
+
+   // A graph that was filtered away entirely becomes a single backtrack node.
+   if (root == nullptr) {
+     return zone()->New<EndNode>(EndNode::BACKTRACK, zone());
+   }
+   return root;
+ }
+
+ // Reports a fatal out-of-memory condition when the stack limit has been
+ // exceeded; returns normally otherwise.
+ void RegExpCompiler::ToNodeCheckForStackOverflow() {
+   StackLimitCheck stack_check(isolate());
+   if (!stack_check.HasOverflowed()) return;
+   V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
+ }
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-compiler.h b/js/src/irregexp/imported/regexp-compiler.h
new file mode 100644
index 0000000000..91dd43ab8a
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-compiler.h
@@ -0,0 +1,621 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_COMPILER_H_
+#define V8_REGEXP_REGEXP_COMPILER_H_
+
+#include <bitset>
+
+#include "irregexp/imported/regexp-nodes.h"
+
+namespace v8 {
+namespace internal {
+
+class DynamicBitSet;
+class Isolate;
+
+ namespace regexp_compiler_constants {
+
+ // The '2' variant has inclusive from and exclusive to.
+ // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
+ // which includes WhiteSpace (7.2) or LineTerminator (7.3) values.
+ constexpr base::uc32 kRangeEndMarker = 0x110000;  // One past the largest Unicode code point (0x10FFFF).
+ constexpr int kSpaceRanges[] = {  // Flat list of [from, to) pairs, terminated by kRangeEndMarker.
+ '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
+ 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
+ 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
+ constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);  // Counts include the end marker entry.
+
+ constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_',
+ '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
+ constexpr int kWordRangeCount = arraysize(kWordRanges);
+ constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
+ constexpr int kDigitRangeCount = arraysize(kDigitRanges);
+ constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
+ kLeadSurrogateStart + 1, kRangeEndMarker};  // NOTE(review): as written this spans only kLeadSurrogateStart itself; confirm that is intended.
+ constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
+ constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
+ 0x2028, 0x202A, kRangeEndMarker};
+ constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
+
+ // More makes code generation slower, less makes V8 benchmark score lower.
+ constexpr int kMaxLookaheadForBoyerMoore = 8;
+ // In a 3-character pattern you can maximally step forwards 3 characters
+ // at a time, which is not always enough to pay for the extra logic.
+ constexpr int kPatternTooShortForBoyerMoore = 2;
+
+ } // namespace regexp_compiler_constants
+
+ // Returns true when a unicode flag (unicode or unicode sets) is combined with
+ // ignore_case. In that case ICU is needed to find the closure over case
+ // equivalents.
+ inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) {
+   if (!IsEitherUnicode(flags)) return false;
+   return IsIgnoreCase(flags);
+ }
+
+ // Details of a quick mask-compare check that can look ahead in the
+ // input stream. Holds one mask/value pair per character position plus a combined mask and value.
+ class QuickCheckDetails {
+ public:
+ QuickCheckDetails()
+ : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
+ explicit QuickCheckDetails(int characters)
+ : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
+ bool Rationalize(bool one_byte);
+ // Merge in the information from another branch of an alternation.
+ void Merge(QuickCheckDetails* other, int from_index);
+ // Advance the current position by some amount.
+ void Advance(int by, bool one_byte);
+ void Clear();
+ bool cannot_match() { return cannot_match_; }
+ void set_cannot_match() { cannot_match_ = true; }  // One-way switch: there is no way to clear the flag here.
+ struct Position {
+ Position() : mask(0), value(0), determines_perfectly(false) {}
+ base::uc32 mask;
+ base::uc32 value;
+ bool determines_perfectly;
+ };
+ int characters() { return characters_; }
+ void set_characters(int characters) { characters_ = characters; }
+ Position* positions(int index) {  // DCHECKs 0 <= index < characters_.
+ DCHECK_LE(0, index);
+ DCHECK_GT(characters_, index);
+ return positions_ + index;
+ }
+ uint32_t mask() { return mask_; }
+ uint32_t value() { return value_; }
+
+ private:
+ // How many characters do we have quick check information from. This is
+ // the same for all branches of a choice node.
+ int characters_;
+ Position positions_[4];  // Fixed capacity of four positions.
+ // These values are the condensate of the above array after Rationalize().
+ uint32_t mask_;
+ uint32_t value_;
+ // If set to true, there is no way this quick check can match at all.
+ // E.g., if it requires to be at the start of the input, and isn't.
+ bool cannot_match_;
+ };
+
+// Improve the speed that we scan for an initial point where a non-anchored
+// regexp can match by using a Boyer-Moore-like table. This is done by
+// identifying non-greedy non-capturing loops in the nodes that eat any
+// character one at a time. For example in the middle of the regexp
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
+// inserted at the start of any non-anchored regexp.
+//
+// When we have found such a loop we look ahead in the nodes to find the set of
+// characters that can come at given distances. For example for the regexp
+// /.?foo/ we know that there are at least 3 characters ahead of us, and the
+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
+// the lookahead info where the set of characters is reasonably constrained. In
+// our example this is from index 1 to 2 (0 is not constrained). We can now
+// look 3 characters ahead and if we don't find one of [f, o] (the union of
+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
+//
+// For Unicode input strings we do the same, but modulo 128.
+//
+// We also look at the first string fed to the regexp and use that to get a hint
+// of the character frequencies in the inputs. This affects the assessment of
+// whether the set of characters is 'reasonably constrained'.
+//
+// We also have another lookahead mechanism (called quick check in the code),
+// which uses a wide load of multiple characters followed by a mask and compare
+// to determine whether a match is possible at this point.
+ enum ContainedInLattice {  // Values are bit flags: Combine() ORs them, so kLatticeIn | kLatticeOut == kLatticeUnknown.
+ kNotYet = 0,
+ kLatticeIn = 1,
+ kLatticeOut = 2,
+ kLatticeUnknown = 3 // Can also mean both in and out.
+ };
+
+ // Joins two lattice values by OR-ing their bit patterns, so combining
+ // kLatticeIn with kLatticeOut yields kLatticeUnknown.
+ inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
+   const int joined_bits = static_cast<int>(a) | static_cast<int>(b);
+   return static_cast<ContainedInLattice>(joined_bits);
+ }
+
+ class BoyerMoorePositionInfo : public ZoneObject {  // Character set for one lookahead position, kept as a kMapSize-bit bitset.
+ public:
+ bool at(int i) const { return map_[i]; }
+
+ static constexpr int kMapSize = 128;
+ static constexpr int kMask = kMapSize - 1;  // Low-bit mask; valid because kMapSize is a power of two.
+
+ int map_count() const { return map_count_; }
+
+ void Set(int character);
+ void SetInterval(const Interval& interval);
+ void SetAll();
+
+ bool is_non_word() { return w_ == kLatticeOut; }
+ bool is_word() { return w_ == kLatticeIn; }
+
+ using Bitset = std::bitset<kMapSize>;
+ Bitset raw_bitset() const { return map_; }  // Returns a copy of the bitset.
+
+ private:
+ Bitset map_;
+ int map_count_ = 0; // Number of set bits in the map.
+ ContainedInLattice w_ = kNotYet; // The \w character class.
+ };
+
+ class BoyerMooreLookahead : public ZoneObject {  // One BoyerMoorePositionInfo per lookahead position, addressed by map number.
+ public:
+ BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
+
+ int length() { return length_; }
+ int max_char() { return max_char_; }
+ RegExpCompiler* compiler() { return compiler_; }
+
+ int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
+
+ BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
+
+ void Set(int map_number, int character) {  // Characters above max_char_ are ignored.
+ if (character > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
+ info->Set(character);
+ }
+
+ void SetInterval(int map_number, const Interval& interval) {  // Ignored if it starts above max_char_; otherwise clamped to max_char_.
+ if (interval.from() > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
+ if (interval.to() > max_char_) {
+ info->SetInterval(Interval(interval.from(), max_char_));
+ } else {
+ info->SetInterval(interval);
+ }
+ }
+
+ void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
+
+ void SetRest(int from_map) {  // Calls SetAll on every position in [from_map, length_).
+ for (int i = from_map; i < length_; i++) SetAll(i);
+ }
+ void EmitSkipInstructions(RegExpMacroAssembler* masm);
+
+ private:
+ // This is the value obtained by EatsAtLeast. If we do not have at least this
+ // many characters left in the sample string then the match is bound to fail.
+ // Therefore it is OK to read a character this far ahead of the current match
+ // point.
+ int length_;
+ RegExpCompiler* compiler_;
+ // 0xff for Latin1, 0xffff for UTF-16.
+ int max_char_;
+ ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
+
+ int GetSkipTable(int min_lookahead, int max_lookahead,
+ Handle<ByteArray> boolean_skip_table);
+ bool FindWorthwhileInterval(int* from, int* to);
+ int FindBestInterval(int max_number_of_chars, int old_biggest_points,
+ int* from, int* to);
+ };
+
+ // There are many ways to generate code for a node. This class encapsulates
+ // the current way we should be generating. In other words it encapsulates
+ // the current state of the code generator. The effect of this is that we
+ // generate code for paths that the matcher can take through the regular
+ // expression. A given node in the regexp can be code-generated several times
+ // as it can be part of several traces. For example for the regexp:
+ // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
+ // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
+ // to match foo is generated only once (the traces have a common prefix). The
+ // code to store the capture is deferred and generated (twice) after the places
+ // where baz has been matched.
+ class Trace {
+ public:
+ // A value for a property that is either known to be true, known to be false,
+ // or not known.
+ enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
+
+ class DeferredAction {  // Node of a singly linked list; Trace::add_action() links nodes through next_.
+ public:
+ DeferredAction(ActionNode::ActionType action_type, int reg)
+ : action_type_(action_type), reg_(reg), next_(nullptr) {}
+ DeferredAction* next() { return next_; }
+ bool Mentions(int reg);
+ int reg() { return reg_; }
+ ActionNode::ActionType action_type() { return action_type_; }
+
+ private:
+ ActionNode::ActionType action_type_;
+ int reg_;
+ DeferredAction* next_;
+ friend class Trace;
+ };
+
+ class DeferredCapture : public DeferredAction {
+ public:
+ DeferredCapture(int reg, bool is_capture, Trace* trace)
+ : DeferredAction(ActionNode::STORE_POSITION, reg),
+ cp_offset_(trace->cp_offset()),  // Snapshot of the trace's offset at construction time.
+ is_capture_(is_capture) {}
+ int cp_offset() { return cp_offset_; }
+ bool is_capture() { return is_capture_; }
+
+ private:
+ int cp_offset_;
+ bool is_capture_;
+ void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
+ };
+
+ class DeferredSetRegisterForLoop : public DeferredAction {
+ public:
+ DeferredSetRegisterForLoop(int reg, int value)
+ : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
+ value_(value) {}
+ int value() { return value_; }
+
+ private:
+ int value_;
+ };
+
+ class DeferredClearCaptures : public DeferredAction {
+ public:
+ explicit DeferredClearCaptures(Interval range)
+ : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}  // reg is -1; the interval is kept in range_.
+ Interval range() { return range_; }
+
+ private:
+ Interval range_;
+ };
+
+ class DeferredIncrementRegister : public DeferredAction {
+ public:
+ explicit DeferredIncrementRegister(int reg)
+ : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
+ };
+
+ Trace()
+ : cp_offset_(0),
+ actions_(nullptr),
+ backtrack_(nullptr),
+ stop_node_(nullptr),
+ loop_label_(nullptr),
+ characters_preloaded_(0),
+ bound_checked_up_to_(0),
+ flush_budget_(100),
+ at_start_(UNKNOWN) {}
+
+ // End the trace. This involves flushing the deferred actions in the trace
+ // and pushing a backtrack location onto the backtrack stack. Once this is
+ // done we can start a new trace or go to one that has already been
+ // generated.
+ void Flush(RegExpCompiler* compiler, RegExpNode* successor);
+ int cp_offset() { return cp_offset_; }
+ DeferredAction* actions() { return actions_; }
+ // A trivial trace is one that has no deferred actions or other state that
+ // affects the assumptions used when generating code. There is no recorded
+ // backtrack location in a trivial trace, so with a trivial trace we will
+ // generate code that, on a failure to match, gets the backtrack location
+ // from the backtrack stack rather than using a direct jump instruction. We
+ // always start code generation with a trivial trace and non-trivial traces
+ // are created as we emit code for nodes or add to the list of deferred
+ // actions in the trace. The location of the code generated for a node using
+ // a trivial trace is recorded in a label in the node so that gotos can be
+ // generated to that code.
+ bool is_trivial() {
+ return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
+ characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
+ quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
+ }
+ TriBool at_start() { return at_start_; }
+ void set_at_start(TriBool at_start) { at_start_ = at_start; }
+ Label* backtrack() { return backtrack_; }
+ Label* loop_label() { return loop_label_; }
+ RegExpNode* stop_node() { return stop_node_; }
+ int characters_preloaded() { return characters_preloaded_; }
+ int bound_checked_up_to() { return bound_checked_up_to_; }
+ int flush_budget() { return flush_budget_; }
+ QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
+ bool mentions_reg(int reg);
+ // Returns true if a deferred position store exists to the specified
+ // register and stores the offset in the out-parameter. Otherwise
+ // returns false.
+ bool GetStoredPosition(int reg, int* cp_offset);
+ // These set methods and AdvanceCurrentPositionInTrace should be used only on
+ // new traces - the intention is that traces are immutable after creation.
+ void add_action(DeferredAction* new_action) {  // Pushes |new_action| onto the front of the action list.
+ DCHECK(new_action->next_ == nullptr);
+ new_action->next_ = actions_;
+ actions_ = new_action;
+ }
+ void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
+ void set_stop_node(RegExpNode* node) { stop_node_ = node; }
+ void set_loop_label(Label* label) { loop_label_ = label; }
+ void set_characters_preloaded(int count) { characters_preloaded_ = count; }
+ void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
+ void set_flush_budget(int to) { flush_budget_ = to; }
+ void set_quick_check_performed(QuickCheckDetails* d) {  // Stores a copy of |*d|.
+ quick_check_performed_ = *d;
+ }
+ void InvalidateCurrentCharacter();
+ void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
+
+ private:
+ int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
+ void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
+ const DynamicBitSet& affected_registers,
+ DynamicBitSet* registers_to_pop,
+ DynamicBitSet* registers_to_clear, Zone* zone);
+ void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
+ const DynamicBitSet& registers_to_pop,
+ const DynamicBitSet& registers_to_clear);
+ int cp_offset_;
+ DeferredAction* actions_;
+ Label* backtrack_;
+ RegExpNode* stop_node_;
+ Label* loop_label_;
+ int characters_preloaded_;
+ int bound_checked_up_to_;
+ QuickCheckDetails quick_check_performed_;  // Default-constructed; not listed in the constructor's initializer list.
+ int flush_budget_;
+ TriBool at_start_;
+ };
+
+ class GreedyLoopState {  // Owns one Label and one Trace; both are handed out by pointer.
+ public:
+ explicit GreedyLoopState(bool not_at_start);
+
+ Label* label() { return &label_; }
+ Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
+
+ private:
+ Label label_;
+ Trace counter_backtrack_trace_;
+ };
+
+ struct PreloadState {  // Plain aggregate of preload bookkeeping; it has no constructor.
+ static const int kEatsAtLeastNotYetInitialized = -1;
+ bool preload_is_current_;
+ bool preload_has_checked_bounds_;
+ int preload_characters_;
+ int eats_at_least_;
+ void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }  // Resets only eats_at_least_; the other fields are left untouched.
+ };
+
+ // Analysis performs assertion propagation and computes eats_at_least_ values.
+ // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
+ // details. Returns RegExpError::kNone on success, otherwise the recorded error (e.g. kAnalysisStackOverflow).
+ RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
+ RegExpNode* node);
+
+ // Tallies how often each character has been seen. Characters are reduced to
+ // a table slot with RegExpMacroAssembler::kTableMask, so there is one counter
+ // per slot of the macro assembler's table.
+ class FrequencyCollator {
+  public:
+   FrequencyCollator() : total_samples_(0) {
+     // Slot i starts out as an empty counter labelled with character i.
+     int character = 0;
+     for (CharacterFrequency& slot : frequencies_) {
+       slot = CharacterFrequency(character++);
+     }
+   }
+
+   // Adds one sample for |character|.
+   void CountCharacter(int character) {
+     const int slot = character & RegExpMacroAssembler::kTableMask;
+     frequencies_[slot].Increment();
+     ++total_samples_;
+   }
+
+   // The result is not a percentage but a per-128 figure (128 being the table
+   // size from the regexp macro assembler). |in_character| must already be a
+   // valid table slot. Returns 1 when nothing has been sampled yet.
+   int Frequency(int in_character) {
+     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
+     if (total_samples_ < 1) return 1;  // Division by zero.
+     const int scaled_count = frequencies_[in_character].counter() * 128;
+     return scaled_count / total_samples_;
+   }
+
+  private:
+   // Occurrence counter for a single character.
+   class CharacterFrequency {
+    public:
+     CharacterFrequency() : counter_(0), character_(-1) {}
+     explicit CharacterFrequency(int character)
+         : counter_(0), character_(character) {}
+
+     void Increment() { counter_++; }
+     int counter() { return counter_; }
+     int character() { return character_; }
+
+    private:
+     int counter_;
+     int character_;
+   };
+
+   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
+   int total_samples_;
+ };
+
+// Holds the state shared across one regexp compilation: register allocation,
+// the node work list, flags, and recursion/expansion bookkeeping. Out-of-line
+// members (constructor, Assemble, PreprocessRegExp, ...) are defined elsewhere.
+class RegExpCompiler {
+ public:
+ RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
+ RegExpFlags flags, bool is_one_byte);
+
+ // Hands out the next register index. Once RegExpMacroAssembler::kMaxRegister
+ // has been reached, the regexp is flagged as too big and the current index
+ // is returned without being advanced.
+ int AllocateRegister() {
+ if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
+ reg_exp_too_big_ = true;
+ return next_register_;
+ }
+ return next_register_++;
+ }
+
+ // Lookarounds to match lone surrogates for unicode character class matches
+ // are never nested. We can therefore reuse registers.
+ // The register is allocated on first use (kNoRegister means "not yet").
+ int UnicodeLookaroundStackRegister() {
+ if (unicode_lookaround_stack_register_ == kNoRegister) {
+ unicode_lookaround_stack_register_ = AllocateRegister();
+ }
+ return unicode_lookaround_stack_register_;
+ }
+
+ // Same lazy allocation scheme as UnicodeLookaroundStackRegister().
+ int UnicodeLookaroundPositionRegister() {
+ if (unicode_lookaround_position_register_ == kNoRegister) {
+ unicode_lookaround_position_register_ = AllocateRegister();
+ }
+ return unicode_lookaround_position_register_;
+ }
+
+ // Outcome of Assemble(): either an error, or a code handle plus the number
+ // of registers. |error| stays RegExpError::kNone on the success path.
+ struct CompilationResult final {
+ explicit CompilationResult(RegExpError err) : error(err) {}
+ CompilationResult(Handle<Object> code, int registers)
+ : code(code), num_registers(registers) {}
+
+ // Convenience factory for the RegExpError::kTooLarge failure.
+ static CompilationResult RegExpTooBig() {
+ return CompilationResult(RegExpError::kTooLarge);
+ }
+
+ bool Succeeded() const { return error == RegExpError::kNone; }
+
+ const RegExpError error = RegExpError::kNone;
+ Handle<Object> code;
+ int num_registers = 0;
+ };
+
+ CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
+ RegExpNode* start, int capture_count,
+ Handle<String> pattern);
+
+ // Preprocessing is the final step of node creation before analysis
+ // and assembly. It includes:
+ // - Wrapping the body of the regexp in capture 0.
+ // - Inserting the implicit .* before/after the regexp if necessary.
+ // - If the input is a one-byte string, filtering out nodes that can't match.
+ // - Fixing up regexp matches that start within a surrogate pair.
+ RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags,
+ bool is_one_byte);
+
+ // If the regexp matching starts within a surrogate pair, step back to the
+ // lead surrogate and start matching from there.
+ RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success);
+
+ // Queues |node| on the work list unless it is already queued or its label
+ // is already bound.
+ inline void AddWork(RegExpNode* node) {
+ if (!node->on_work_list() && !node->label()->is_bound()) {
+ node->set_on_work_list(true);
+ work_list_->push_back(node);
+ }
+ }
+
+ static const int kImplementationOffset = 0;
+ static const int kNumberOfRegistersOffset = 0;
+ static const int kCodeOffset = 1;
+
+ RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
+ EndNode* accept() { return accept_; }
+
+ static const int kMaxRecursion = 100;
+ inline int recursion_depth() { return recursion_depth_; }
+ inline void IncrementRecursionDepth() { recursion_depth_++; }
+ inline void DecrementRecursionDepth() { recursion_depth_--; }
+
+ RegExpFlags flags() const { return flags_; }
+
+ // Sets the same flag that AllocateRegister() raises on register exhaustion.
+ void SetRegExpTooBig() { reg_exp_too_big_ = true; }
+
+ inline bool one_byte() { return one_byte_; }
+ inline bool optimize() { return optimize_; }
+ inline void set_optimize(bool value) { optimize_ = value; }
+ inline bool limiting_recursion() { return limiting_recursion_; }
+ inline void set_limiting_recursion(bool value) {
+ limiting_recursion_ = value;
+ }
+ bool read_backward() { return read_backward_; }
+ void set_read_backward(bool value) { read_backward_ = value; }
+ FrequencyCollator* frequency_collator() { return &frequency_collator_; }
+
+ int current_expansion_factor() { return current_expansion_factor_; }
+ void set_current_expansion_factor(int value) {
+ current_expansion_factor_ = value;
+ }
+
+ // The recursive nature of ToNode node generation means we may run into stack
+ // overflow issues. We introduce periodic checks to detect these, and the
+ // tick counter helps limit overhead of these checks.
+ // TODO(jgruber): This is super hacky and should be replaced by an abort
+ // mechanism or iterative node generation.
+ // The tick counter is post-incremented, so the real check runs on the first
+ // call and on every 16th call after it.
+ void ToNodeMaybeCheckForStackOverflow() {
+ if ((to_node_overflow_check_ticks_++ % 16 == 0)) {
+ ToNodeCheckForStackOverflow();
+ }
+ }
+ void ToNodeCheckForStackOverflow();
+
+ Isolate* isolate() const { return isolate_; }
+ Zone* zone() const { return zone_; }
+
+ // Marks a lazily allocated register as not yet allocated.
+ static const int kNoRegister = -1;
+
+ private:
+ EndNode* accept_;
+ int next_register_;
+ int unicode_lookaround_stack_register_;
+ int unicode_lookaround_position_register_;
+ ZoneVector<RegExpNode*>* work_list_;
+ int recursion_depth_;
+ const RegExpFlags flags_;
+ RegExpMacroAssembler* macro_assembler_;
+ bool one_byte_;
+ bool reg_exp_too_big_;
+ bool limiting_recursion_;
+ int to_node_overflow_check_ticks_ = 0;
+ bool optimize_;
+ bool read_backward_;
+ int current_expansion_factor_;
+ FrequencyCollator frequency_collator_;
+ Isolate* isolate_;
+ Zone* zone_;
+};
+
+// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
+class UnicodeRangeSplitter {
+ public:
+ // The constructor is defined out of line.
+ // NOTE(review): presumably it distributes |base| into the four vectors below
+ // via AddRange -- confirm against the implementation.
+ V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
+
+ // Inline capacity of each per-category vector.
+ static constexpr int kInitialSize = 8;
+ using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
+
+ // Read-only views of the per-category ranges; the returned pointers refer to
+ // members of this object.
+ const CharacterRangeVector* bmp() const { return &bmp_; }
+ const CharacterRangeVector* lead_surrogates() const {
+ return &lead_surrogates_;
+ }
+ const CharacterRangeVector* trail_surrogates() const {
+ return &trail_surrogates_;
+ }
+ const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
+
+ private:
+ void AddRange(CharacterRange range);
+
+ CharacterRangeVector bmp_;
+ CharacterRangeVector lead_surrogates_;
+ CharacterRangeVector trail_surrogates_;
+ CharacterRangeVector non_bmp_;
+};
+
+// We need to check for the following characters: 0x39C 0x3BC 0x178.
+// TODO(jgruber): Move to CharacterRange.
+bool RangeContainsLatin1Equivalents(CharacterRange range);
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_COMPILER_H_
diff --git a/js/src/irregexp/imported/regexp-dotprinter.cc b/js/src/irregexp/imported/regexp-dotprinter.cc
new file mode 100644
index 0000000000..6746992a0a
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-dotprinter.cc
@@ -0,0 +1,249 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-dotprinter.h"
+
+#include "irregexp/imported/regexp-compiler.h"
+
+namespace v8 {
+namespace internal {
+
+// -------------------------------------------------------------------
+// Dot/dotty output
+
+// NodeVisitor that writes a Graphviz "dot" description of a regexp node graph
+// to the stream passed to the constructor.
+class DotPrinterImpl : public NodeVisitor {
+ public:
+ // |os| is stored by reference.
+ explicit DotPrinterImpl(std::ostream& os) : os_(os) {}
+ // Entry point: prints the whole digraph starting from |node|.
+ void PrintNode(const char* label, RegExpNode* node);
+ // Visits |node| unless it has already been marked visited.
+ void Visit(RegExpNode* node);
+ void PrintAttributes(RegExpNode* from);
+ void PrintOnFailure(RegExpNode* from, RegExpNode* to);
+ // Declares one VisitXxx(XxxNode*) override per entry of FOR_EACH_NODE_TYPE.
+#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
+ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
+#undef DECLARE_VISIT
+ private:
+ std::ostream& os_;
+};
+
+// Writes a complete "digraph G { ... }" description to the output stream: a
+// graph-level label built from |label|, followed by everything reachable from
+// |node|. The stream is flushed once the closing brace has been written.
+//
+// |label| is embedded in a double-quoted DOT string, so backslashes and double
+// quotes are escaped. An unescaped '"' would end the quoted string early and
+// produce malformed output.
+void DotPrinterImpl::PrintNode(const char* label, RegExpNode* node) {
+  os_ << "digraph G {\n graph [label=\"";
+  for (int i = 0; label[i]; i++) {
+    switch (label[i]) {
+      case '\\':
+        os_ << "\\\\";
+        break;
+      case '"':
+        // Emit backslash + quote so the quote stays inside the label.
+        os_ << "\\\"";
+        break;
+      default:
+        os_ << label[i];
+        break;
+    }
+  }
+  os_ << "\"];\n";
+  Visit(node);
+  os_ << "}" << std::endl;
+}
+
+// Dispatches |node| to its type-specific handler at most once: the first call
+// sets the node's visited flag, later calls do nothing.
+void DotPrinterImpl::Visit(RegExpNode* node) {
+  NodeInfo* node_info = node->info();
+  if (!node_info->visited) {
+    node_info->visited = true;
+    node->Accept(this);
+  }
+}
+
+// Draws a dotted edge from |from| to |on_failure| and then visits the failure
+// target.
+void DotPrinterImpl::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
+  os_ << " n" << from;
+  os_ << " -> n" << on_failure;
+  os_ << " [style=dotted];\n";
+  Visit(on_failure);
+}
+
+// Writes "{...}" attribute cells to a stream, inserting a "|" between
+// consecutive cells but not before the first one.
+class AttributePrinter {
+ public:
+  explicit AttributePrinter(std::ostream& os) : os_(os), first_(true) {}
+
+  // Emits the cell separator, except on the very first call.
+  void PrintSeparator() {
+    if (!first_) {
+      os_ << "|";
+      return;
+    }
+    first_ = false;
+  }
+
+  // Emits a "{name}" cell when |value| is set; otherwise prints nothing.
+  void PrintBit(const char* name, bool value) {
+    if (value) {
+      PrintSeparator();
+      os_ << "{" << name << "}";
+    }
+  }
+
+  // Emits a "{name|value}" cell for non-negative |value|; negative values are
+  // skipped.
+  void PrintPositive(const char* name, int value) {
+    if (value >= 0) {
+      PrintSeparator();
+      os_ << "{" << name << "|" << value << "}";
+    }
+  }
+
+ private:
+  std::ostream& os_;
+  bool first_;
+};
+
+// Emits a grey record node "a<ptr>" listing the interest bits of |that| (and
+// the label position when the label is bound), plus a dashed edge tying the
+// record to the node "n<ptr>" it describes.
+void DotPrinterImpl::PrintAttributes(RegExpNode* that) {
+  os_ << " a" << that;
+  os_ << " [shape=Mrecord, color=grey, fontcolor=grey, "
+         "margin=0.1, fontsize=10, label=\"{";
+
+  AttributePrinter cells(os_);
+  NodeInfo* node_info = that->info();
+  cells.PrintBit("NI", node_info->follows_newline_interest);
+  cells.PrintBit("WI", node_info->follows_word_interest);
+  cells.PrintBit("SI", node_info->follows_start_interest);
+
+  Label* node_label = that->label();
+  if (node_label->is_bound()) {
+    cells.PrintPositive("@", node_label->pos());
+  }
+
+  os_ << "}\"];\n";
+  os_ << " a" << that << " -> n" << that;
+  os_ << " [style=dashed, color=grey, arrowhead=none];\n";
+}
+
+// Prints the choice node, one edge per alternative, and then the alternatives
+// themselves.
+//
+// Each edge is written as its own ";\n"-terminated statement, matching every
+// other edge this printer emits. Alternatives are entered through Visit() so
+// that the visited flag is honoured and a node reachable along several paths
+// is printed only once.
+void DotPrinterImpl::VisitChoice(ChoiceNode* that) {
+  os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
+  for (int i = 0; i < that->alternatives()->length(); i++) {
+    GuardedAlternative alt = that->alternatives()->at(i);
+    os_ << " n" << that << " -> n" << alt.node() << ";\n";
+  }
+  for (int i = 0; i < that->alternatives()->length(); i++) {
+    GuardedAlternative alt = that->alternatives()->at(i);
+    Visit(alt.node());
+  }
+}
+
+// Loop choices are printed exactly like plain choice nodes.
+void DotPrinterImpl::VisitLoopChoice(LoopChoiceNode* that) {
+ VisitChoice(that);
+}
+
+// Negative lookaround choices are printed exactly like plain choice nodes.
+void DotPrinterImpl::VisitNegativeLookaroundChoice(
+ NegativeLookaroundChoiceNode* that) {
+ VisitChoice(that);
+}
+
+// Prints a box node whose label spells out the text elements of |that|: atoms
+// character by character, character classes as a bracketed list of from-to
+// ranges. Then prints the attribute record and the edge to the successor,
+// which is visited last.
+void DotPrinterImpl::VisitText(TextNode* that) {
+  Zone* zone = that->zone();
+  os_ << " n" << that << " [label=\"";
+  const char* separator = "";
+  for (int index = 0; index < that->elements()->length(); index++) {
+    // A single space goes between elements, never before the first.
+    os_ << separator;
+    separator = " ";
+    TextElement element = that->elements()->at(index);
+    switch (element.text_type()) {
+      case TextElement::ATOM: {
+        base::Vector<const base::uc16> code_units = element.atom()->data();
+        for (int k = 0; k < code_units.length(); k++) {
+          // Each 16-bit code unit is narrowed to char before printing.
+          os_ << static_cast<char>(code_units[k]);
+        }
+        break;
+      }
+      case TextElement::CLASS_RANGES: {
+        RegExpClassRanges* class_ranges = element.class_ranges();
+        os_ << "[";
+        if (class_ranges->is_negated()) {
+          os_ << "^";
+        }
+        for (int k = 0; k < class_ranges->ranges(zone)->length(); k++) {
+          CharacterRange range = class_ranges->ranges(zone)->at(k);
+          os_ << AsUC32(range.from()) << "-" << AsUC32(range.to());
+        }
+        os_ << "]";
+        break;
+      }
+      default:
+        UNREACHABLE();
+    }
+  }
+  os_ << "\", shape=box, peripheries=2];\n";
+  PrintAttributes(that);
+  RegExpNode* successor = that->on_success();
+  os_ << " n" << that << " -> n" << successor << ";\n";
+  Visit(successor);
+}
+
+// Prints a double-octagon node labelled with the start/end registers of the
+// back reference, its attribute record, and the edge to the successor.
+void DotPrinterImpl::VisitBackReference(BackReferenceNode* that) {
+  os_ << " n" << that << " [label=\"$" << that->start_register();
+  os_ << "..$" << that->end_register() << "\", shape=doubleoctagon];\n";
+  PrintAttributes(that);
+  RegExpNode* successor = that->on_success();
+  os_ << " n" << that << " -> n" << successor << ";\n";
+  Visit(successor);
+}
+
+// Prints an end node as a bold point plus its attribute record. No successor
+// edge is printed or followed here.
+void DotPrinterImpl::VisitEnd(EndNode* that) {
+ os_ << " n" << that << " [style=bold, shape=point];\n";
+ PrintAttributes(that);
+}
+
+// Prints an assertion as a septagon labelled with the regexp syntax of the
+// assertion type, followed by its attribute record and the edge to the
+// successor, which is visited last.
+void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
+  // Attribute text for the node; stays empty if no case below applies.
+  const char* attributes = "";
+  switch (that->assertion_type()) {
+    case AssertionNode::AT_END:
+      attributes = "label=\"$\", shape=septagon";
+      break;
+    case AssertionNode::AT_START:
+      attributes = "label=\"^\", shape=septagon";
+      break;
+    case AssertionNode::AT_BOUNDARY:
+      attributes = "label=\"\\b\", shape=septagon";
+      break;
+    case AssertionNode::AT_NON_BOUNDARY:
+      attributes = "label=\"\\B\", shape=septagon";
+      break;
+    case AssertionNode::AFTER_NEWLINE:
+      attributes = "label=\"(?<=\\n)\", shape=septagon";
+      break;
+  }
+  os_ << " n" << that << " [" << attributes << "];\n";
+  PrintAttributes(that);
+  RegExpNode* next = that->on_success();
+  os_ << " n" << that << " -> n" << next << ";\n";
+  Visit(next);
+}
+
+// Prints an action node. The label depends on the action type and is built
+// from the matching member of the node's data union. The attribute record and
+// the edge to the successor follow, and the successor is visited last.
+void DotPrinterImpl::VisitAction(ActionNode* that) {
+  const auto& data = that->data_;
+  os_ << " n" << that << " [";
+  switch (that->action_type_) {
+    case ActionNode::SET_REGISTER_FOR_LOOP:
+      os_ << "label=\"$" << data.u_store_register.reg;
+      os_ << ":=" << data.u_store_register.value << "\", shape=octagon";
+      break;
+    case ActionNode::INCREMENT_REGISTER:
+      os_ << "label=\"$" << data.u_increment_register.reg;
+      os_ << "++\", shape=octagon";
+      break;
+    case ActionNode::STORE_POSITION:
+      os_ << "label=\"$" << data.u_position_register.reg;
+      os_ << ":=$pos\", shape=octagon";
+      break;
+    case ActionNode::BEGIN_POSITIVE_SUBMATCH:
+      os_ << "label=\"$" << data.u_submatch.current_position_register;
+      os_ << ":=$pos,begin-positive\", shape=septagon";
+      break;
+    case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
+      os_ << "label=\"$" << data.u_submatch.current_position_register;
+      os_ << ":=$pos,begin-negative\", shape=septagon";
+      break;
+    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
+      os_ << "label=\"escape\", shape=septagon";
+      break;
+    case ActionNode::EMPTY_MATCH_CHECK:
+      os_ << "label=\"$" << data.u_empty_match_check.start_register;
+      os_ << "=$pos?,$" << data.u_empty_match_check.repetition_register;
+      os_ << "<" << data.u_empty_match_check.repetition_limit;
+      os_ << "?\", shape=septagon";
+      break;
+    case ActionNode::CLEAR_CAPTURES: {
+      os_ << "label=\"clear $" << data.u_clear_captures.range_from;
+      os_ << " to $" << data.u_clear_captures.range_to;
+      os_ << "\", shape=septagon";
+      break;
+    }
+  }
+  os_ << "];\n";
+  PrintAttributes(that);
+  RegExpNode* next = that->on_success();
+  os_ << " n" << that << " -> n" << next << ";\n";
+  Visit(next);
+}
+
+// Prints the graph rooted at |node| through a DotPrinterImpl that writes to a
+// StdoutStream; |label| becomes the graph label.
+void DotPrinter::DotPrint(const char* label, RegExpNode* node) {
+ StdoutStream os;
+ DotPrinterImpl printer(os);
+ printer.PrintNode(label, node);
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-dotprinter.h b/js/src/irregexp/imported/regexp-dotprinter.h
new file mode 100644
index 0000000000..7fcece6e1a
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-dotprinter.h
@@ -0,0 +1,23 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_DOTPRINTER_H_
+#define V8_REGEXP_REGEXP_DOTPRINTER_H_
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+class RegExpNode;
+
+// Static-only facade (derives from AllStatic) for dumping a regexp node graph.
+class DotPrinter final : public AllStatic {
+ public:
+ // |label| names the graph; |node| is the node the dump starts from.
+ static void DotPrint(const char* label, RegExpNode* node);
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_DOTPRINTER_H_
diff --git a/js/src/irregexp/imported/regexp-error.cc b/js/src/irregexp/imported/regexp-error.cc
new file mode 100644
index 0000000000..d0b4c263a4
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-error.cc
@@ -0,0 +1,22 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-error.h"
+
+namespace v8 {
+namespace internal {
+
+// Message table built by expanding REGEXP_ERROR_MESSAGES and keeping only the
+// STRING of each entry. The entries therefore appear in the order the macro
+// lists them.
+const char* const kRegExpErrorStrings[] = {
+#define TEMPLATE(NAME, STRING) STRING,
+ REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+};
+
+const char* RegExpErrorString(RegExpError error) {
+ DCHECK_LT(error, RegExpError::NumErrors);
+ return kRegExpErrorStrings[static_cast<int>(error)];
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-error.h b/js/src/irregexp/imported/regexp-error.h
new file mode 100644
index 0000000000..ff4fe41cd5
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-error.h
@@ -0,0 +1,67 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_ERROR_H_
+#define V8_REGEXP_REGEXP_ERROR_H_
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+#define REGEXP_ERROR_MESSAGES(T) \
+ T(None, "") \
+ T(StackOverflow, "Maximum call stack size exceeded") \
+ T(AnalysisStackOverflow, "Stack overflow") \
+ T(TooLarge, "Regular expression too large") \
+ T(UnterminatedGroup, "Unterminated group") \
+ T(UnmatchedParen, "Unmatched ')'") \
+ T(EscapeAtEndOfPattern, "\\ at end of pattern") \
+ T(InvalidPropertyName, "Invalid property name") \
+ T(InvalidEscape, "Invalid escape") \
+ T(InvalidDecimalEscape, "Invalid decimal escape") \
+ T(InvalidUnicodeEscape, "Invalid Unicode escape") \
+ T(NothingToRepeat, "Nothing to repeat") \
+ T(LoneQuantifierBrackets, "Lone quantifier brackets") \
+ T(RangeOutOfOrder, "numbers out of order in {} quantifier") \
+ T(IncompleteQuantifier, "Incomplete quantifier") \
+ T(InvalidQuantifier, "Invalid quantifier") \
+ T(InvalidGroup, "Invalid group") \
+ T(MultipleFlagDashes, "Multiple dashes in flag group") \
+ T(NotLinear, "Cannot be executed in linear time") \
+ T(RepeatedFlag, "Repeated flag in flag group") \
+ T(InvalidFlagGroup, "Invalid flag group") \
+ T(TooManyCaptures, "Too many captures") \
+ T(InvalidCaptureGroupName, "Invalid capture group name") \
+ T(DuplicateCaptureGroupName, "Duplicate capture group name") \
+ T(InvalidNamedReference, "Invalid named reference") \
+ T(InvalidNamedCaptureReference, "Invalid named capture referenced") \
+ T(InvalidClassEscape, "Invalid class escape") \
+ T(InvalidClassPropertyName, "Invalid property name in character class") \
+ T(InvalidCharacterClass, "Invalid character class") \
+ T(UnterminatedCharacterClass, "Unterminated character class") \
+ T(OutOfOrderCharacterClass, "Range out of order in character class") \
+ T(InvalidClassSetOperation, "Invalid set operation in character class") \
+ T(InvalidCharacterInClass, "Invalid character in character class") \
+ T(NegatedCharacterClassWithStrings, \
+ "Negated character class may contain strings")
+
+// One enumerator k<NAME> per REGEXP_ERROR_MESSAGES entry, in macro order,
+// followed by NumErrors, which therefore equals the number of entries.
+enum class RegExpError : uint32_t {
+#define TEMPLATE(NAME, STRING) k##NAME,
+ REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+ NumErrors
+};
+
+V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
+
+inline constexpr bool RegExpErrorIsStackOverflow(RegExpError error) {
+ return error == RegExpError::kStackOverflow ||
+ error == RegExpError::kAnalysisStackOverflow;
+}
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_ERROR_H_
diff --git a/js/src/irregexp/imported/regexp-interpreter.cc b/js/src/irregexp/imported/regexp-interpreter.cc
new file mode 100644
index 0000000000..859fa53c0b
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-interpreter.cc
@@ -0,0 +1,1147 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// A simple interpreter for the Irregexp byte code.
+
+#include "irregexp/imported/regexp-interpreter.h"
+
+#include "irregexp/imported/regexp-bytecodes.h"
+#include "irregexp/imported/regexp-macro-assembler.h"
+#include "irregexp/imported/regexp-stack.h" // For kMaximumStackSize.
+#include "irregexp/imported/regexp.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uchar.h"
+#endif // V8_INTL_SUPPORT
+
+// Use token threaded dispatch iff the compiler supports computed gotos and the
+// build argument v8_enable_regexp_interpreter_threaded_dispatch was set.
+#if V8_HAS_COMPUTED_GOTO && \
+ defined(V8_ENABLE_REGEXP_INTERPRETER_THREADED_DISPATCH)
+#define V8_USE_COMPUTED_GOTO 1
+#endif // V8_HAS_COMPUTED_GOTO
+
+namespace v8 {
+namespace internal {
+
+namespace {
+
+// Two-byte variant: compares |len| code units starting at |from| with those
+// starting at |current| by delegating to the macro assembler's
+// case-insensitive comparison, picking the Unicode or non-Unicode flavour
+// according to |unicode|.
+bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
+                          base::Vector<const base::uc16> subject,
+                          bool unicode) {
+  const base::uc16* captured_start = &subject.at(from);
+  const base::uc16* candidate_start = &subject.at(current);
+  Address offset_a =
+      reinterpret_cast<Address>(const_cast<base::uc16*>(captured_start));
+  Address offset_b =
+      reinterpret_cast<Address>(const_cast<base::uc16*>(candidate_start));
+  // The comparison helpers take a length in bytes, not in code units.
+  size_t length = len * base::kUC16Size;
+
+  if (unicode) {
+    const bool matched = RegExpMacroAssembler::CaseInsensitiveCompareUnicode(
+        offset_a, offset_b, length, isolate);
+    return matched;
+  }
+  const bool matched = RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(
+      offset_a, offset_b, length, isolate);
+  return matched;
+}
+
+// One-byte variant: compares |len| characters starting at |from| with those
+// starting at |current|, ignoring case. For Latin1 characters the unicode flag
+// makes no difference.
+bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
+                          base::Vector<const uint8_t> subject, bool unicode) {
+  for (int remaining = len; remaining > 0; --remaining) {
+    unsigned int captured = subject[from++];
+    unsigned int candidate = subject[current++];
+    if (captured == candidate) continue;
+    // Fold both characters to lower case by setting bit 0x20.
+    captured |= 0x20;
+    candidate |= 0x20;
+    if (captured != candidate) return false;
+    // The fold is only meaningful for letters: 'a'..'z' in the ASCII range, or
+    // 224..254 except 247 in the Latin-1 range. The subtractions are unsigned,
+    // so values below the lower bound wrap around and fail the comparison.
+    const bool is_ascii_letter = captured - 'a' <= 'z' - 'a';
+    const bool is_latin1_letter =
+        captured - 224 <= 254 - 224 && captured != 247;
+    if (!(is_ascii_letter || is_latin1_letter)) return false;
+  }
+  return true;
+}
+
+#ifdef DEBUG
+// When v8_flags.trace_regexp_bytecodes is set, prints one trace line with the
+// bytecode offset, stack depth, current position and current character, then
+// disassembles the bytecode at |pc|. Does nothing otherwise.
+//
+// |bytecode_length| and |bytecode_name| are accepted for the BYTECODE macro's
+// benefit but are not used here.
+void MaybeTraceInterpreter(const byte* code_base, const byte* pc,
+                           int stack_depth, int current_position,
+                           uint32_t current_char, int bytecode_length,
+                           const char* bytecode_name) {
+  if (!v8_flags.trace_regexp_bytecodes) return;
+
+  // std::isprint has undefined behaviour for arguments that are neither EOF
+  // nor representable as unsigned char. current_char is a full 32-bit value,
+  // so it must be range-checked before the call.
+  const bool printable =
+      current_char <= 0xFF && std::isprint(static_cast<int>(current_char)) != 0;
+  const char* format =
+      printable
+          ? "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = "
+          : "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = ";
+  // Cast the varargs to the types the conversions expect: %02x takes an
+  // unsigned int (the pointer difference is a ptrdiff_t), %c takes an int.
+  PrintF(format, static_cast<unsigned int>(pc - code_base), stack_depth,
+         current_position, current_char,
+         printable ? static_cast<int>(current_char) : '.');
+
+  RegExpBytecodeDisassembleSingle(code_base, pc);
+}
+#endif // DEBUG
+
+// Reads a signed 32-bit value from |pc|, which must be 4-byte aligned (checked
+// with DCHECK_EQ only).
+int32_t Load32Aligned(const byte* pc) {
+  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 3);
+  const int32_t* word = reinterpret_cast<const int32_t*>(pc);
+  return *word;
+}
+
+// TODO(jgruber): Rename to Load16AlignedUnsigned.
+uint32_t Load16Aligned(const byte* pc) {
+  // |pc| must be 2-byte aligned. The unsigned 16-bit value read is widened to
+  // uint32_t on return.
+  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
+  const uint16_t* halfword = reinterpret_cast<const uint16_t*>(pc);
+  return *halfword;
+}
+
+// Reads a signed 16-bit value from the 2-byte aligned |pc| and sign-extends it
+// to int32_t.
+int32_t Load16AlignedSigned(const byte* pc) {
+  DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
+  const int16_t* halfword = reinterpret_cast<const int16_t*>(pc);
+  return *halfword;
+}
+
+// Helpers to access the packed argument. Takes the 32 bits containing the
+// current bytecode, where the 8 LSB contain the bytecode and the rest contains
+// a packed 24-bit argument.
+// TODO(jgruber): Specify signed-ness in bytecode signature declarations, and
+// police restrictions during bytecode generation.
+int32_t LoadPacked24Signed(int32_t bytecode_and_packed_arg) {
+  // Shifting the signed value right by BYTECODE_SHIFT drops the bytecode bits
+  // and leaves the packed argument.
+  const int32_t packed_arg = bytecode_and_packed_arg >> BYTECODE_SHIFT;
+  return packed_arg;
+}
+uint32_t LoadPacked24Unsigned(int32_t bytecode_and_packed_arg) {
+  // Convert to unsigned first so the shift fills the top bits with zeros.
+  const uint32_t raw_bits = static_cast<uint32_t>(bytecode_and_packed_arg);
+  return raw_bits >> BYTECODE_SHIFT;
+}
+
+// Thin wrapper around the stack the interpreter uses while matching.
+//
+// Although it is called the 'backtracking' stack, it is a general-purpose
+// stack of ints: it holds program counters (= offsets into the bytecode) as
+// well as arbitrary integer values.
+class BacktrackStack {
+ public:
+  BacktrackStack() = default;
+  BacktrackStack(const BacktrackStack&) = delete;
+  BacktrackStack& operator=(const BacktrackStack&) = delete;
+
+  // Appends |v|. The value is stored even when false is returned; false
+  // reports that the stack now holds more than kMaxSize entries.
+  V8_WARN_UNUSED_RESULT bool push(int v) {
+    data_.emplace_back(v);
+    const int size_after_push = static_cast<int>(data_.size());
+    return size_after_push <= kMaxSize;
+  }
+
+  // Returns the top element; the stack must not be empty.
+  int peek() const {
+    DCHECK(!data_.empty());
+    return data_.back();
+  }
+
+  // Removes and returns the top element.
+  int pop() {
+    const int top = peek();
+    data_.pop_back();
+    return top;
+  }
+
+  // The 'sp' is the index of the first empty element in the stack.
+  int sp() const { return static_cast<int>(data_.size()); }
+
+  // Truncates the stack to |new_sp| elements. Growing is not allowed.
+  void set_sp(int new_sp) {
+    DCHECK_LE(new_sp, sp());
+    data_.resize_no_init(new_sp);
+  }
+
+ private:
+  // Semi-arbitrary. Large enough that common cases stay inside the inline
+  // backing store, small enough not to waste space.
+  static constexpr int kStaticCapacity = 64;
+
+  using ValueT = int;
+  base::SmallVector<ValueT, kStaticCapacity> data_;
+
+  // Entry limit derived from the byte limit RegExpStack::kMaximumStackSize.
+  static constexpr int kMaxSize =
+      RegExpStack::kMaximumStackSize / sizeof(ValueT);
+};
+
+// Registers used during interpreter execution. These consist of output
+// registers in indices [0, output_register_count[ which will contain matcher
+// results as a {start,end} index tuple for each capture (where the whole match
+// counts as implicit capture 0); and internal registers in indices
+// [output_register_count, total_register_count[.
+class InterpreterRegisters {
+ public:
+ using RegisterT = int;
+
+ // Allocates |total_register_count| working registers. |output_registers| is
+ // the caller-provided buffer that CopyToOutputRegisters() fills with the
+ // first |output_register_count| registers.
+ InterpreterRegisters(int total_register_count, RegisterT* output_registers,
+ int output_register_count)
+ : registers_(total_register_count),
+ output_registers_(output_registers),
+ output_register_count_(output_register_count) {
+ // TODO(jgruber): Use int32_t consistently for registers. Currently, CSA
+ // uses int32_t while runtime uses int.
+ static_assert(sizeof(int) == sizeof(int32_t));
+ DCHECK_GE(output_register_count, 2); // At least 2 for the match itself.
+ DCHECK_GE(total_register_count, output_register_count);
+ DCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount);
+ DCHECK_NOT_NULL(output_registers);
+
+ // Initialize the output register region to -1 signifying 'no match'.
+ // memset writes the byte 0xFF everywhere, i.e. every int becomes all ones.
+ // Only the first output_register_count registers are touched here.
+ std::memset(registers_.data(), -1,
+ output_register_count * sizeof(RegisterT));
+ }
+
+ // Unchecked element access to the working registers.
+ const RegisterT& operator[](size_t index) const { return registers_[index]; }
+ RegisterT& operator[](size_t index) { return registers_[index]; }
+
+ // Copies the first output_register_count_ working registers into the
+ // caller's output buffer.
+ void CopyToOutputRegisters() {
+ MemCopy(output_registers_, registers_.data(),
+ output_register_count_ * sizeof(RegisterT));
+ }
+
+ private:
+ static constexpr int kStaticCapacity = 64; // Arbitrary.
+ base::SmallVector<RegisterT, kStaticCapacity> registers_;
+ RegisterT* const output_registers_;
+ const int output_register_count_;
+};
+
+// Reports a stack overflow on |isolate| and returns EXCEPTION. May only be
+// called with call_origin == kFromRuntime; this is enforced with CHECK.
+IrregexpInterpreter::Result ThrowStackOverflow(Isolate* isolate,
+ RegExp::CallOrigin call_origin) {
+ CHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
+ // We abort interpreter execution after the stack overflow is thrown, and thus
+ // allow allocation here despite the outer DisallowGarbageCollectionScope.
+ AllowGarbageCollection yes_gc;
+ isolate->StackOverflow();
+ return IrregexpInterpreter::EXCEPTION;
+}
+
+// Always yields the EXCEPTION status code. The stack overflow is actually
+// thrown only when the call originated from the runtime.
+IrregexpInterpreter::Result MaybeThrowStackOverflow(
+    Isolate* isolate, RegExp::CallOrigin call_origin) {
+  if (call_origin != RegExp::CallOrigin::kFromRuntime) {
+    return IrregexpInterpreter::EXCEPTION;
+  }
+  return ThrowStackOverflow(isolate, call_origin);
+}
+
+// Refreshes the raw (unhandlified) references held by the interpreter from the
+// handles |code_array| and |subject_string|.
+// - If the code array's data start address differs from *code_base_out, the
+// code array, code base and pc outputs are rewritten so that pc keeps its
+// offset relative to the new base.
+// - The subject string and its character vector outputs are always rewritten.
+template <typename Char>
+void UpdateCodeAndSubjectReferences(
+ Isolate* isolate, Handle<ByteArray> code_array,
+ Handle<String> subject_string, ByteArray* code_array_out,
+ const byte** code_base_out, const byte** pc_out, String* subject_string_out,
+ base::Vector<const Char>* subject_string_vector_out) {
+ DisallowGarbageCollection no_gc;
+
+ if (*code_base_out != code_array->GetDataStartAddress()) {
+ *code_array_out = *code_array;
+ // Compute the offset against the old base before that base is overwritten.
+ const intptr_t pc_offset = *pc_out - *code_base_out;
+ DCHECK_GT(pc_offset, 0);
+ *code_base_out = code_array->GetDataStartAddress();
+ *pc_out = *code_base_out + pc_offset;
+ }
+
+ DCHECK(subject_string->IsFlat());
+ *subject_string_out = *subject_string;
+ *subject_string_vector_out = subject_string->GetCharVector<Char>(no_gc);
+}
+
+// Runs all pending interrupts and updates unhandlified object references if
+// necessary.
+template <typename Char>
+// Returns SUCCESS when execution may continue (nothing was pending, or the
+// interrupts were handled and the references refreshed), EXCEPTION on stack
+// overflow or when interrupt handling produced an exception, and RETRY when
+// the caller has to restart matching.
+IrregexpInterpreter::Result HandleInterrupts(
+ Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out,
+ String* subject_string_out, const byte** code_base_out,
+ base::Vector<const Char>* subject_string_vector_out, const byte** pc_out) {
+ DisallowGarbageCollection no_gc;
+
+ StackLimitCheck check(isolate);
+ bool js_has_overflowed = check.JsHasOverflowed();
+
+ if (call_origin == RegExp::CallOrigin::kFromJs) {
+ // Direct calls from JavaScript can be interrupted in two ways:
+ // 1. A real stack overflow, in which case we let the caller throw the
+ // exception.
+ // 2. The stack guard was used to interrupt execution for another purpose,
+ // forcing the call through the runtime system.
+ if (js_has_overflowed) {
+ return IrregexpInterpreter::EXCEPTION;
+ } else if (check.InterruptRequested()) {
+ return IrregexpInterpreter::RETRY;
+ }
+ } else {
+ DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
+ // Prepare for possible GC.
+ // The handles are created before interrupts run so that the raw outputs
+ // can be refreshed from them afterwards.
+ HandleScope handles(isolate);
+ Handle<ByteArray> code_handle(*code_array_out, isolate);
+ Handle<String> subject_handle(*subject_string_out, isolate);
+
+ if (js_has_overflowed) {
+ return ThrowStackOverflow(isolate, call_origin);
+ } else if (check.InterruptRequested()) {
+ // Remember the representation before interrupts run; it is compared
+ // against the representation afterwards.
+ const bool was_one_byte =
+ String::IsOneByteRepresentationUnderneath(*subject_string_out);
+ Object result;
+ {
+ // GC is permitted only while the interrupts are being handled.
+ AllowGarbageCollection yes_gc;
+ result = isolate->stack_guard()->HandleInterrupts();
+ }
+ if (result.IsException(isolate)) {
+ return IrregexpInterpreter::EXCEPTION;
+ }
+
+ // If we changed between a LATIN1 and a UC16 string, we need to
+ // restart regexp matching with the appropriate template instantiation of
+ // RawMatch.
+ if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
+ was_one_byte) {
+ return IrregexpInterpreter::RETRY;
+ }
+
+ UpdateCodeAndSubjectReferences(
+ isolate, code_handle, subject_handle, code_array_out, code_base_out,
+ pc_out, subject_string_out, subject_string_vector_out);
+ }
+ }
+
+ return IrregexpInterpreter::SUCCESS;
+}
+
+// Tests the bit for |current_char| in the bit table |table|. The character is
+// first reduced with RegExpMacroAssembler::kTableMask; the upper part of the
+// reduced value picks the byte and the low bits pick the bit inside it.
+bool CheckBitInTable(const uint32_t current_char, const byte* const table) {
+  const int mask = RegExpMacroAssembler::kTableMask;
+  const uint32_t byte_index = (current_char & mask) >> kBitsPerByteLog2;
+  const int table_byte = table[byte_index];
+  const int bit_index = (current_char & (kBitsPerByte - 1));
+  const int bit_mask = 1 << bit_index;
+  return (table_byte & bit_mask) != 0;
+}
+
+// True exactly when 0 <= index < length. Both values are converted to
+// unsigned, so a negative index becomes a huge value and fails the single
+// comparison.
+bool IndexIsInBounds(int index, int length) {
+  DCHECK_GE(length, 0);
+  const uintptr_t unsigned_index = static_cast<uintptr_t>(index);
+  const uintptr_t unsigned_length = static_cast<uintptr_t>(length);
+  return unsigned_index < unsigned_length;
+}
+
+// If computed gotos are supported by the compiler, we can get addresses to
+// labels directly in C/C++. Every bytecode handler has its own label and we
+// store the addresses in a dispatch table indexed by bytecode. To execute the
+// next handler we simply jump (goto) directly to its address.
+#if V8_USE_COMPUTED_GOTO
+#define BC_LABEL(name) BC_##name:
+#define DECODE() \
+ do { \
+ next_insn = Load32Aligned(next_pc); \
+ next_handler_addr = dispatch_table[next_insn & BYTECODE_MASK]; \
+ } while (false)
+#define DISPATCH() \
+ pc = next_pc; \
+ insn = next_insn; \
+ goto* next_handler_addr
+// Without computed goto support, we fall back to a simple switch-based
+// dispatch (A large switch statement inside a loop with a case for every
+// bytecode).
+#else // V8_USE_COMPUTED_GOTO
+#define BC_LABEL(name) case BC_##name:
+#define DECODE() next_insn = Load32Aligned(next_pc)
+#define DISPATCH() \
+ pc = next_pc; \
+ insn = next_insn; \
+ goto switch_dispatch_continuation
+#endif // V8_USE_COMPUTED_GOTO
+
+// ADVANCE/SET_PC_FROM_OFFSET are separated from DISPATCH, because ideally some
+// instructions can be executed between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH.
+// We want those two macros as far apart as possible, because the goto in
+// DISPATCH is dependent on a memory load in ADVANCE/SET_PC_FROM_OFFSET. If we
+// don't hit the cache and have to fetch the next handler address from physical
+// memory, instructions between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH can
+// potentially be executed unconditionally, reducing memory stall.
+#define ADVANCE(name) \
+ next_pc = pc + RegExpBytecodeLength(BC_##name); \
+ DECODE()
+#define SET_PC_FROM_OFFSET(offset) \
+ next_pc = code_base + offset; \
+ DECODE()
+
+// Current position mutations.
+#define SET_CURRENT_POSITION(value) \
+ do { \
+ current = (value); \
+ DCHECK(base::IsInRange(current, 0, subject.length())); \
+ } while (false)
+#define ADVANCE_CURRENT_POSITION(by) SET_CURRENT_POSITION(current + (by))
+
+#ifdef DEBUG
+#define BYTECODE(name) \
+ BC_LABEL(name) \
+ MaybeTraceInterpreter(code_base, pc, backtrack_stack.sp(), current, \
+ current_char, RegExpBytecodeLength(BC_##name), #name);
+#else
+#define BYTECODE(name) BC_LABEL(name)
+#endif // DEBUG
+
+// Interpreter loop for Irregexp bytecode, instantiated once per subject
+// character type (Char).
+//
+// Executes the bytecode in |code_array| against |subject|, starting at position
+// |current| with |current_char| as the initial loaded character. The loop only
+// ends when a handler returns:
+//  - SUCCEED copies the registers to the output registers and returns SUCCESS.
+//  - FAIL returns FAILURE.
+//  - A failed push on the backtrack stack returns whatever
+//    MaybeThrowStackOverflow() produces.
+//  - POP_BT returns the code packed into the instruction once the number of
+//    backtracks equals |backtrack_limit|, or any non-SUCCESS result of
+//    HandleInterrupts().
+//
+// Handlers use ADVANCE()/SET_PC_FROM_OFFSET() to select and pre-decode the next
+// instruction and DISPATCH() to jump to it; DISPATCH() always ends in a goto,
+// so no code after it in the same handler runs.
+template <typename Char>
+IrregexpInterpreter::Result RawMatch(
+ Isolate* isolate, ByteArray code_array, String subject_string,
+ base::Vector<const Char> subject, int* output_registers,
+ int output_register_count, int total_register_count, int current,
+ uint32_t current_char, RegExp::CallOrigin call_origin,
+ const uint32_t backtrack_limit) {
+ DisallowGarbageCollection no_gc;
+
+#if V8_USE_COMPUTED_GOTO
+
+// We have to make sure that no OOB access to the dispatch table is possible and
+// all values are valid label addresses.
+// Otherwise jumps to arbitrary addresses could potentially happen.
+// This is ensured as follows:
+// Every index to the dispatch table gets masked using BYTECODE_MASK in
+// DECODE(). This way we can only get values between 0 (only the least
+// significant byte of an integer is used) and kRegExpPaddedBytecodeCount - 1
+// (BYTECODE_MASK is defined to be exactly this value).
+// All entries from kRegExpBytecodeCount to kRegExpPaddedBytecodeCount have to
+// be filled with BREAKs (invalid operation).
+
+// Fill dispatch table from last defined bytecode up to the next power of two
+// with BREAK (invalid operation).
+// TODO(pthier): Find a way to fill up automatically (at compile time)
+// 59 real bytecodes -> 5 fillers
+#define BYTECODE_FILLER_ITERATOR(V) \
+ V(BREAK) /* 1 */ \
+ V(BREAK) /* 2 */ \
+ V(BREAK) /* 3 */ \
+ V(BREAK) /* 4 */ \
+ V(BREAK) /* 5 */
+
+// Each filler entry expands to "+1", so applying COUNT to the iterator yields
+// the number of filler entries as a constant expression.
+#define COUNT(...) +1
+ static constexpr int kRegExpBytecodeFillerCount =
+ BYTECODE_FILLER_ITERATOR(COUNT);
+#undef COUNT
+
+ // Make sure kRegExpPaddedBytecodeCount is actually the closest possible power
+ // of two.
+ DCHECK_EQ(kRegExpPaddedBytecodeCount,
+ base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount));
+
+ // Make sure every bytecode we get by using BYTECODE_MASK is well defined.
+ static_assert(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount);
+ static_assert(kRegExpBytecodeCount + kRegExpBytecodeFillerCount ==
+ kRegExpPaddedBytecodeCount);
+
+// One label address per real bytecode, followed by the BREAK fillers.
+#define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name,
+ static const void* const dispatch_table[kRegExpPaddedBytecodeCount] = {
+ BYTECODE_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY)
+ BYTECODE_FILLER_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY)};
+#undef DECLARE_DISPATCH_TABLE_ENTRY
+#undef BYTECODE_FILLER_ITERATOR
+
+#endif // V8_USE_COMPUTED_GOTO
+
+ // pc walks the bytecode; code_base stays at its start so that jump targets,
+ // which are stored as offsets, can be resolved by SET_PC_FROM_OFFSET.
+ const byte* pc = code_array.GetDataStartAddress();
+ const byte* code_base = pc;
+
+ InterpreterRegisters registers(total_register_count, output_registers,
+ output_register_count);
+ BacktrackStack backtrack_stack;
+
+ // Number of POP_BT instructions executed so far; compared against
+ // backtrack_limit in POP_BT and reported to the counters on FAIL/SUCCEED.
+ uint32_t backtrack_count = 0;
+
+#ifdef DEBUG
+ if (v8_flags.trace_regexp_bytecodes) {
+ PrintF("\n\nStart bytecode interpreter\n\n");
+ }
+#endif
+
+ while (true) {
+ const byte* next_pc = pc;
+ int32_t insn;
+ int32_t next_insn;
+#if V8_USE_COMPUTED_GOTO
+ // Computed goto: decode the first instruction and jump to its handler. The
+ // handlers chain to each other through DISPATCH().
+ const void* next_handler_addr;
+ DECODE();
+ DISPATCH();
+#else
+ // Switch fallback: every loop iteration loads the instruction at pc and
+ // selects its handler by the masked opcode.
+ insn = Load32Aligned(pc);
+ switch (insn & BYTECODE_MASK) {
+#endif // V8_USE_COMPUTED_GOTO
+ // BREAK is the invalid operation that also pads the dispatch table; it must
+ // never be executed.
+ BYTECODE(BREAK) { UNREACHABLE(); }
+ // --- Backtrack stack pushes. A push() that reports failure ends the match
+ // with the result of MaybeThrowStackOverflow(). ---
+ BYTECODE(PUSH_CP) {
+ ADVANCE(PUSH_CP);
+ if (!backtrack_stack.push(current)) {
+ return MaybeThrowStackOverflow(isolate, call_origin);
+ }
+ DISPATCH();
+ }
+ BYTECODE(PUSH_BT) {
+ ADVANCE(PUSH_BT);
+ // The pushed value is the 32-bit operand at pc + 4; POP_BT later uses it
+ // as a code offset.
+ if (!backtrack_stack.push(Load32Aligned(pc + 4))) {
+ return MaybeThrowStackOverflow(isolate, call_origin);
+ }
+ DISPATCH();
+ }
+ BYTECODE(PUSH_REGISTER) {
+ ADVANCE(PUSH_REGISTER);
+ if (!backtrack_stack.push(registers[LoadPacked24Unsigned(insn)])) {
+ return MaybeThrowStackOverflow(isolate, call_origin);
+ }
+ DISPATCH();
+ }
+ // --- Register manipulation. The register index is the unsigned 24-bit value
+ // packed into the instruction word. ---
+ BYTECODE(SET_REGISTER) {
+ ADVANCE(SET_REGISTER);
+ registers[LoadPacked24Unsigned(insn)] = Load32Aligned(pc + 4);
+ DISPATCH();
+ }
+ BYTECODE(ADVANCE_REGISTER) {
+ ADVANCE(ADVANCE_REGISTER);
+ registers[LoadPacked24Unsigned(insn)] += Load32Aligned(pc + 4);
+ DISPATCH();
+ }
+ BYTECODE(SET_REGISTER_TO_CP) {
+ ADVANCE(SET_REGISTER_TO_CP);
+ registers[LoadPacked24Unsigned(insn)] = current + Load32Aligned(pc + 4);
+ DISPATCH();
+ }
+ BYTECODE(SET_CP_TO_REGISTER) {
+ ADVANCE(SET_CP_TO_REGISTER);
+ SET_CURRENT_POSITION(registers[LoadPacked24Unsigned(insn)]);
+ DISPATCH();
+ }
+ BYTECODE(SET_REGISTER_TO_SP) {
+ ADVANCE(SET_REGISTER_TO_SP);
+ registers[LoadPacked24Unsigned(insn)] = backtrack_stack.sp();
+ DISPATCH();
+ }
+ BYTECODE(SET_SP_TO_REGISTER) {
+ ADVANCE(SET_SP_TO_REGISTER);
+ backtrack_stack.set_sp(registers[LoadPacked24Unsigned(insn)]);
+ DISPATCH();
+ }
+ // --- Backtrack stack pops. ---
+ BYTECODE(POP_CP) {
+ ADVANCE(POP_CP);
+ SET_CURRENT_POSITION(backtrack_stack.pop());
+ DISPATCH();
+ }
+ BYTECODE(POP_BT) {
+ // The counter is incremented before the comparison, so a limit of 0
+ // (kNoBacktrackLimit) is not hit short of 32-bit wraparound.
+ static_assert(JSRegExp::kNoBacktrackLimit == 0);
+ if (++backtrack_count == backtrack_limit) {
+ // The result to report on hitting the limit is packed into the
+ // instruction itself.
+ int return_code = LoadPacked24Signed(insn);
+ return static_cast<IrregexpInterpreter::Result>(return_code);
+ }
+
+ // HandleInterrupts receives pointers to every raw reference held by this
+ // loop (code_array, subject_string, code_base, subject, pc), presumably
+ // so it can update them if objects moved. Anything but SUCCESS aborts the
+ // match with that result.
+ IrregexpInterpreter::Result return_code =
+ HandleInterrupts(isolate, call_origin, &code_array, &subject_string,
+ &code_base, &subject, &pc);
+ if (return_code != IrregexpInterpreter::SUCCESS) return return_code;
+
+ SET_PC_FROM_OFFSET(backtrack_stack.pop());
+ DISPATCH();
+ }
+ BYTECODE(POP_REGISTER) {
+ ADVANCE(POP_REGISTER);
+ registers[LoadPacked24Unsigned(insn)] = backtrack_stack.pop();
+ DISPATCH();
+ }
+ // --- Terminal instructions. Both record the backtrack count as a sample;
+ // only SUCCEED copies the registers to the output registers. ---
+ BYTECODE(FAIL) {
+ isolate->counters()->regexp_backtracks()->AddSample(
+ static_cast<int>(backtrack_count));
+ return IrregexpInterpreter::FAILURE;
+ }
+ BYTECODE(SUCCEED) {
+ isolate->counters()->regexp_backtracks()->AddSample(
+ static_cast<int>(backtrack_count));
+ registers.CopyToOutputRegisters();
+ return IrregexpInterpreter::SUCCESS;
+ }
+ // --- Position changes and unconditional jumps. ---
+ BYTECODE(ADVANCE_CP) {
+ ADVANCE(ADVANCE_CP);
+ ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn));
+ DISPATCH();
+ }
+ BYTECODE(GOTO) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ BYTECODE(ADVANCE_CP_AND_GOTO) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn));
+ DISPATCH();
+ }
+ BYTECODE(CHECK_GREEDY) {
+ // If the position equals the value on top of the backtrack stack, that
+ // value is dropped and the jump is taken; otherwise execution continues
+ // with the next instruction.
+ if (current == backtrack_stack.peek()) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ backtrack_stack.pop();
+ } else {
+ ADVANCE(CHECK_GREEDY);
+ }
+ DISPATCH();
+ }
+ // --- Character loads. The checked variants jump to the offset at pc + 4 when
+ // the requested characters are not all inside the subject; the
+ // *_UNCHECKED variants perform no bounds test. ---
+ BYTECODE(LOAD_CURRENT_CHAR) {
+ int pos = current + LoadPacked24Signed(insn);
+ if (pos >= subject.length() || pos < 0) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(LOAD_CURRENT_CHAR);
+ current_char = subject[pos];
+ }
+ DISPATCH();
+ }
+ BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
+ ADVANCE(LOAD_CURRENT_CHAR_UNCHECKED);
+ int pos = current + LoadPacked24Signed(insn);
+ current_char = subject[pos];
+ DISPATCH();
+ }
+ BYTECODE(LOAD_2_CURRENT_CHARS) {
+ int pos = current + LoadPacked24Signed(insn);
+ if (pos + 2 > subject.length() || pos < 0) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(LOAD_2_CURRENT_CHARS);
+ // The second character is placed above the first, shifted by the bit
+ // width of Char.
+ Char next = subject[pos + 1];
+ current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
+ }
+ DISPATCH();
+ }
+ BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
+ ADVANCE(LOAD_2_CURRENT_CHARS_UNCHECKED);
+ int pos = current + LoadPacked24Signed(insn);
+ Char next = subject[pos + 1];
+ current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
+ DISPATCH();
+ }
+ BYTECODE(LOAD_4_CURRENT_CHARS) {
+ // Four characters only fit into current_char for one-byte subjects.
+ DCHECK_EQ(1, sizeof(Char));
+ int pos = current + LoadPacked24Signed(insn);
+ if (pos + 4 > subject.length() || pos < 0) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(LOAD_4_CURRENT_CHARS);
+ Char next1 = subject[pos + 1];
+ Char next2 = subject[pos + 2];
+ Char next3 = subject[pos + 3];
+ current_char =
+ (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24));
+ }
+ DISPATCH();
+ }
+ BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
+ ADVANCE(LOAD_4_CURRENT_CHARS_UNCHECKED);
+ DCHECK_EQ(1, sizeof(Char));
+ int pos = current + LoadPacked24Signed(insn);
+ Char next1 = subject[pos + 1];
+ Char next2 = subject[pos + 2];
+ Char next3 = subject[pos + 3];
+ current_char =
+ (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24));
+ DISPATCH();
+ }
+ // --- Character tests. Each handler jumps to an offset operand when its
+ // condition holds and otherwise continues with the next instruction.
+ // The *_4_CHARS variants take the comparison value from a full 32-bit
+ // operand instead of the 24 bits packed into the instruction. ---
+ BYTECODE(CHECK_4_CHARS) {
+ uint32_t c = Load32Aligned(pc + 4);
+ if (c == current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(CHECK_4_CHARS);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_CHAR) {
+ uint32_t c = LoadPacked24Unsigned(insn);
+ if (c == current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_CHAR);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_4_CHARS) {
+ uint32_t c = Load32Aligned(pc + 4);
+ if (c != current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(CHECK_NOT_4_CHARS);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_CHAR) {
+ uint32_t c = LoadPacked24Unsigned(insn);
+ if (c != current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_NOT_CHAR);
+ }
+ DISPATCH();
+ }
+ // The AND_* variants mask current_char with a 32-bit operand before the
+ // comparison.
+ BYTECODE(AND_CHECK_4_CHARS) {
+ uint32_t c = Load32Aligned(pc + 4);
+ if (c == (current_char & Load32Aligned(pc + 8))) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
+ } else {
+ ADVANCE(AND_CHECK_4_CHARS);
+ }
+ DISPATCH();
+ }
+ BYTECODE(AND_CHECK_CHAR) {
+ uint32_t c = LoadPacked24Unsigned(insn);
+ if (c == (current_char & Load32Aligned(pc + 4))) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(AND_CHECK_CHAR);
+ }
+ DISPATCH();
+ }
+ BYTECODE(AND_CHECK_NOT_4_CHARS) {
+ uint32_t c = Load32Aligned(pc + 4);
+ if (c != (current_char & Load32Aligned(pc + 8))) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
+ } else {
+ ADVANCE(AND_CHECK_NOT_4_CHARS);
+ }
+ DISPATCH();
+ }
+ BYTECODE(AND_CHECK_NOT_CHAR) {
+ uint32_t c = LoadPacked24Unsigned(insn);
+ if (c != (current_char & Load32Aligned(pc + 4))) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(AND_CHECK_NOT_CHAR);
+ }
+ DISPATCH();
+ }
+ BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
+ // Subtracts a 16-bit operand from current_char and masks the result with
+ // another 16-bit operand before comparing.
+ uint32_t c = LoadPacked24Unsigned(insn);
+ uint32_t minus = Load16Aligned(pc + 4);
+ uint32_t mask = Load16Aligned(pc + 6);
+ if (c != ((current_char - minus) & mask)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(MINUS_AND_CHECK_NOT_CHAR);
+ }
+ DISPATCH();
+ }
+ // Range tests: both bounds are 16-bit operands and are inclusive.
+ BYTECODE(CHECK_CHAR_IN_RANGE) {
+ uint32_t from = Load16Aligned(pc + 4);
+ uint32_t to = Load16Aligned(pc + 6);
+ if (from <= current_char && current_char <= to) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(CHECK_CHAR_IN_RANGE);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
+ uint32_t from = Load16Aligned(pc + 4);
+ uint32_t to = Load16Aligned(pc + 6);
+ if (from > current_char || current_char > to) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(CHECK_CHAR_NOT_IN_RANGE);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_BIT_IN_TABLE) {
+ // The table is embedded in the bytecode starting at pc + 8; the jump
+ // target precedes it at pc + 4.
+ if (CheckBitInTable(current_char, pc + 8)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_BIT_IN_TABLE);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_LT) {
+ uint32_t limit = LoadPacked24Unsigned(insn);
+ if (current_char < limit) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_LT);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_GT) {
+ uint32_t limit = LoadPacked24Unsigned(insn);
+ if (current_char > limit) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_GT);
+ }
+ DISPATCH();
+ }
+ // --- Register tests. ---
+ BYTECODE(CHECK_REGISTER_LT) {
+ if (registers[LoadPacked24Unsigned(insn)] < Load32Aligned(pc + 4)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(CHECK_REGISTER_LT);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_REGISTER_GE) {
+ if (registers[LoadPacked24Unsigned(insn)] >= Load32Aligned(pc + 4)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ } else {
+ ADVANCE(CHECK_REGISTER_GE);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_REGISTER_EQ_POS) {
+ if (registers[LoadPacked24Unsigned(insn)] == current) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_REGISTER_EQ_POS);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_REGS_EQUAL) {
+ // Note the inverted branch order: equal registers continue, unequal
+ // registers jump.
+ if (registers[LoadPacked24Unsigned(insn)] ==
+ registers[Load32Aligned(pc + 4)]) {
+ ADVANCE(CHECK_NOT_REGS_EQUAL);
+ } else {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ }
+ DISPATCH();
+ }
+ // --- Back references. The referenced range is read from two consecutive
+ // registers (start, end). Only when the start is non-negative and the
+ // length is positive is anything compared; a mismatch, or a range that
+ // would leave the subject, jumps to the offset at pc + 4. Otherwise the
+ // position moves past the compared text (forward variants) or before
+ // it (*_BACKWARD variants) and execution continues. ---
+ BYTECODE(CHECK_NOT_BACK_REF) {
+ int from = registers[LoadPacked24Unsigned(insn)];
+ int len = registers[LoadPacked24Unsigned(insn) + 1] - from;
+ if (from >= 0 && len > 0) {
+ if (current + len > subject.length() ||
+ !CompareCharsEqual(&subject[from], &subject[current], len)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(len);
+ }
+ ADVANCE(CHECK_NOT_BACK_REF);
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) {
+ int from = registers[LoadPacked24Unsigned(insn)];
+ int len = registers[LoadPacked24Unsigned(insn) + 1] - from;
+ if (from >= 0 && len > 0) {
+ if (current - len < 0 ||
+ !CompareCharsEqual(&subject[from], &subject[current - len], len)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ SET_CURRENT_POSITION(current - len);
+ }
+ ADVANCE(CHECK_NOT_BACK_REF_BACKWARD);
+ DISPATCH();
+ }
+ // The *_NO_CASE variants delegate the comparison to BackRefMatchesNoCase;
+ // its last argument is true for the *_UNICODE variants and false otherwise.
+ BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) {
+ int from = registers[LoadPacked24Unsigned(insn)];
+ int len = registers[LoadPacked24Unsigned(insn) + 1] - from;
+ if (from >= 0 && len > 0) {
+ if (current + len > subject.length() ||
+ !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(len);
+ }
+ ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE);
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
+ int from = registers[LoadPacked24Unsigned(insn)];
+ int len = registers[LoadPacked24Unsigned(insn) + 1] - from;
+ if (from >= 0 && len > 0) {
+ if (current + len > subject.length() ||
+ !BackRefMatchesNoCase(isolate, from, current, len, subject,
+ false)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(len);
+ }
+ ADVANCE(CHECK_NOT_BACK_REF_NO_CASE);
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) {
+ int from = registers[LoadPacked24Unsigned(insn)];
+ int len = registers[LoadPacked24Unsigned(insn) + 1] - from;
+ if (from >= 0 && len > 0) {
+ if (current - len < 0 ||
+ !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
+ true)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ SET_CURRENT_POSITION(current - len);
+ }
+ ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD);
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
+ int from = registers[LoadPacked24Unsigned(insn)];
+ int len = registers[LoadPacked24Unsigned(insn) + 1] - from;
+ if (from >= 0 && len > 0) {
+ if (current - len < 0 ||
+ !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
+ false)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ DISPATCH();
+ }
+ SET_CURRENT_POSITION(current - len);
+ }
+ ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD);
+ DISPATCH();
+ }
+ // --- Position tests. ---
+ BYTECODE(CHECK_AT_START) {
+ if (current + LoadPacked24Signed(insn) == 0) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_AT_START);
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_NOT_AT_START) {
+ if (current + LoadPacked24Signed(insn) == 0) {
+ ADVANCE(CHECK_NOT_AT_START);
+ } else {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ }
+ DISPATCH();
+ }
+ BYTECODE(SET_CURRENT_POSITION_FROM_END) {
+ ADVANCE(SET_CURRENT_POSITION_FROM_END);
+ int by = LoadPacked24Unsigned(insn);
+ // Only moves forward: the position is changed when more than |by|
+ // characters remain, and the character just before the new position is
+ // loaded as current_char.
+ if (subject.length() - current > by) {
+ SET_CURRENT_POSITION(subject.length() - by);
+ current_char = subject[current - 1];
+ }
+ DISPATCH();
+ }
+ BYTECODE(CHECK_CURRENT_POSITION) {
+ int pos = current + LoadPacked24Signed(insn);
+ // pos == subject.length() is accepted here, unlike in LOAD_CURRENT_CHAR.
+ if (pos > subject.length() || pos < 0) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
+ } else {
+ ADVANCE(CHECK_CURRENT_POSITION);
+ }
+ DISPATCH();
+ }
+ // --- SKIP_UNTIL_* loops. Each one repeatedly loads the character at
+ // current + load_offset and steps the position by |advance| until its
+ // condition is met (jump to the first target operand) or the loop's
+ // bounds condition fails (jump to the second target operand). ---
+ BYTECODE(SKIP_UNTIL_CHAR) {
+ int32_t load_offset = LoadPacked24Signed(insn);
+ int32_t advance = Load16AlignedSigned(pc + 4);
+ uint32_t c = Load16Aligned(pc + 6);
+ while (IndexIsInBounds(current + load_offset, subject.length())) {
+ current_char = subject[current + load_offset];
+ if (c == current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 8));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(advance);
+ }
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
+ DISPATCH();
+ }
+ BYTECODE(SKIP_UNTIL_CHAR_AND) {
+ int32_t load_offset = LoadPacked24Signed(insn);
+ int32_t advance = Load16AlignedSigned(pc + 4);
+ uint16_t c = Load16Aligned(pc + 6);
+ uint32_t mask = Load32Aligned(pc + 8);
+ int32_t maximum_offset = Load32Aligned(pc + 12);
+ // The unsigned comparison also rejects a negative current + maximum_offset,
+ // which becomes a very large value after the cast.
+ while (static_cast<uintptr_t>(current + maximum_offset) <=
+ static_cast<uintptr_t>(subject.length())) {
+ current_char = subject[current + load_offset];
+ if (c == (current_char & mask)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 16));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(advance);
+ }
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 20));
+ DISPATCH();
+ }
+ BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) {
+ int32_t load_offset = LoadPacked24Signed(insn);
+ int32_t advance = Load16AlignedSigned(pc + 4);
+ uint16_t c = Load16Aligned(pc + 6);
+ int32_t maximum_offset = Load32Aligned(pc + 8);
+ while (static_cast<uintptr_t>(current + maximum_offset) <=
+ static_cast<uintptr_t>(subject.length())) {
+ current_char = subject[current + load_offset];
+ if (c == current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(advance);
+ }
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 16));
+ DISPATCH();
+ }
+ BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) {
+ int32_t load_offset = LoadPacked24Signed(insn);
+ int32_t advance = Load16AlignedSigned(pc + 4);
+ // The table is embedded in the bytecode starting at pc + 8.
+ const byte* table = pc + 8;
+ while (IndexIsInBounds(current + load_offset, subject.length())) {
+ current_char = subject[current + load_offset];
+ if (CheckBitInTable(current_char, table)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 24));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(advance);
+ }
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 28));
+ DISPATCH();
+ }
+ BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) {
+ int32_t load_offset = LoadPacked24Signed(insn);
+ int32_t advance = Load16AlignedSigned(pc + 4);
+ uint16_t limit = Load16Aligned(pc + 6);
+ const byte* table = pc + 8;
+ while (IndexIsInBounds(current + load_offset, subject.length())) {
+ current_char = subject[current + load_offset];
+ // Both exit conditions lead to the same target (pc + 24).
+ if (current_char > limit) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 24));
+ DISPATCH();
+ }
+ if (!CheckBitInTable(current_char, table)) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 24));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(advance);
+ }
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 28));
+ DISPATCH();
+ }
+ BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) {
+ int32_t load_offset = LoadPacked24Signed(insn);
+ // Unlike the other SKIP_UNTIL_* handlers, |advance| is a 32-bit operand
+ // here.
+ int32_t advance = Load32Aligned(pc + 4);
+ uint16_t c = Load16Aligned(pc + 8);
+ uint16_t c2 = Load16Aligned(pc + 10);
+ while (IndexIsInBounds(current + load_offset, subject.length())) {
+ current_char = subject[current + load_offset];
+ // The two if-statements below are split up intentionally, as combining
+ // them seems to result in register allocation behaving quite
+ // differently and slowing down the resulting code.
+ if (c == current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
+ DISPATCH();
+ }
+ if (c2 == current_char) {
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 12));
+ DISPATCH();
+ }
+ ADVANCE_CURRENT_POSITION(advance);
+ }
+ SET_PC_FROM_OFFSET(Load32Aligned(pc + 16));
+ DISPATCH();
+ }
+#if V8_USE_COMPUTED_GOTO
+// Lint gets confused a lot if we just use !V8_USE_COMPUTED_GOTO or ifndef
+// V8_USE_COMPUTED_GOTO here.
+#else
+ default:
+ UNREACHABLE();
+ }
+ // Label we jump to in DISPATCH(). There must be no instructions between the
+ // end of the switch, this label and the end of the loop.
+ switch_dispatch_continuation : {}
+#endif // V8_USE_COMPUTED_GOTO
+ }
+}
+
+#undef BYTECODE
+#undef ADVANCE_CURRENT_POSITION
+#undef SET_CURRENT_POSITION
+#undef DISPATCH
+#undef DECODE
+#undef SET_PC_FROM_OFFSET
+#undef ADVANCE
+#undef BC_LABEL
+#undef V8_USE_COMPUTED_GOTO
+
+} // namespace
+
+// static
+// Gathers what MatchInternal() needs from |regexp| (bytecode for the subject's
+// representation, register count, backtrack limit) and runs the match.
+IrregexpInterpreter::Result IrregexpInterpreter::Match(
+    Isolate* isolate, JSRegExp regexp, String subject_string,
+    int* output_registers, int output_register_count, int start_position,
+    RegExp::CallOrigin call_origin) {
+  // Tier-up bookkeeping is only ticked while the flag is enabled.
+  if (v8_flags.regexp_tier_up) {
+    regexp.TierUpTick();
+  }
+
+  // The bytecode is looked up by the subject's underlying representation.
+  const bool subject_is_one_byte =
+      String::IsOneByteRepresentationUnderneath(subject_string);
+  ByteArray bytecode = ByteArray::cast(regexp.bytecode(subject_is_one_byte));
+  const int register_count = regexp.max_register_count();
+
+  return MatchInternal(isolate, bytecode, subject_string, output_registers,
+                       output_register_count, register_count, start_position,
+                       call_origin, regexp.backtrack_limit());
+}
+
+// Runs |code_array| against the flat |subject_string|, starting at
+// |start_position|. The subject's flat content decides whether the one-byte or
+// the two-byte instantiation of RawMatch() is used.
+IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
+    Isolate* isolate, ByteArray code_array, String subject_string,
+    int* output_registers, int output_register_count, int total_register_count,
+    int start_position, RegExp::CallOrigin call_origin,
+    uint32_t backtrack_limit) {
+  DCHECK(subject_string.IsFlat());
+
+  // TODO(chromium:1262676): Remove this CHECK once fixed.
+  CHECK(code_array.IsByteArray());
+
+  // Heap allocation is nevertheless permitted in two cases when called from
+  // Runtime:
+  // 1. While creating and throwing a stack overflow exception. The interpreter
+  //    aborts right after, so objects that may have moved are never touched.
+  // 2. While handling interrupts. Unhandlified references are relocated by hand
+  //    once the interrupts have run.
+  DisallowGarbageCollection no_gc;
+
+  String::FlatContent flat = subject_string.GetFlatContent(no_gc);
+  // Interrupts may trigger a GC that relocates the string content, which would
+  // trip FlatContent's checksum verification even though this code is safe
+  // (case 2 above).
+  flat.UnsafeDisableChecksumVerification();
+
+  // The interpreter starts out with the character preceding the start
+  // position, or a newline when the match starts at position 0.
+  base::uc16 char_before_start = '\n';
+  const bool has_predecessor = start_position != 0;
+
+  if (!flat.IsOneByte()) {
+    DCHECK(flat.IsTwoByte());
+    base::Vector<const base::uc16> wide_chars = flat.ToUC16Vector();
+    if (has_predecessor) {
+      char_before_start = wide_chars[start_position - 1];
+    }
+    return RawMatch(isolate, code_array, subject_string, wide_chars,
+                    output_registers, output_register_count,
+                    total_register_count, start_position, char_before_start,
+                    call_origin, backtrack_limit);
+  }
+
+  base::Vector<const uint8_t> narrow_chars = flat.ToOneByteVector();
+  if (has_predecessor) {
+    char_before_start = narrow_chars[start_position - 1];
+  }
+  return RawMatch(isolate, code_array, subject_string, narrow_chars,
+                  output_registers, output_register_count,
+                  total_register_count, start_position, char_before_start,
+                  call_origin, backtrack_limit);
+}
+
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
+// Entry point reached through an external reference from the
+// RegExpExecInternal builtin. The two unnamed Address parameters are ignored.
+IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
+    Address subject, int32_t start_position, Address, Address,
+    int* output_registers, int32_t output_register_count,
+    RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp) {
+  DCHECK_NOT_NULL(isolate);
+  DCHECK_NOT_NULL(output_registers);
+  DCHECK(call_origin == RegExp::CallOrigin::kFromJs);
+
+  // Scopes that forbid GC, JS execution and handle use for the whole call.
+  DisallowGarbageCollection no_gc;
+  DisallowJavascriptExecution no_js(isolate);
+  DisallowHandleAllocation no_handles;
+  DisallowHandleDereference no_deref;
+
+  // Both objects arrive as raw addresses and are used unhandlified.
+  String subject_string = String::cast(Object(subject));
+  JSRegExp regexp_obj = JSRegExp::cast(Object(regexp));
+
+  if (!regexp_obj.MarkedForTierUp()) {
+    return Match(isolate, regexp_obj, subject_string, output_registers,
+                 output_register_count, start_position, call_origin);
+  }
+
+  // RETRY makes the caller re-enter through the runtime, which is where the
+  // recompilation for tier-up actually happens.
+  return IrregexpInterpreter::RETRY;
+}
+
+#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
+// Runtime entry point: dereferences the handles and runs Match() with the call
+// origin fixed to kFromRuntime.
+IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime(
+    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
+    int* output_registers, int output_register_count, int start_position) {
+  const RegExp::CallOrigin origin = RegExp::CallOrigin::kFromRuntime;
+  JSRegExp raw_regexp = *regexp;
+  String raw_subject = *subject_string;
+  return Match(isolate, raw_regexp, raw_subject, output_registers,
+               output_register_count, start_position, origin);
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-interpreter.h b/js/src/irregexp/imported/regexp-interpreter.h
new file mode 100644
index 0000000000..bc55be2b8c
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-interpreter.h
@@ -0,0 +1,68 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// A simple interpreter for the Irregexp byte code.
+
+#ifndef V8_REGEXP_REGEXP_INTERPRETER_H_
+#define V8_REGEXP_REGEXP_INTERPRETER_H_
+
+#include "irregexp/imported/regexp.h"
+
+namespace v8 {
+namespace internal {
+
+class ByteArray;
+
+// Static-only entry points for executing Irregexp bytecode with the
+// interpreter.
+class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
+ public:
+ // Outcome of a match attempt. Each value is defined as the corresponding
+ // RegExp::kInternalRegExp* constant.
+ enum Result {
+ FAILURE = RegExp::kInternalRegExpFailure,
+ SUCCESS = RegExp::kInternalRegExpSuccess,
+ EXCEPTION = RegExp::kInternalRegExpException,
+ RETRY = RegExp::kInternalRegExpRetry,
+ FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental,
+ };
+
+ // In case a StackOverflow occurs, a StackOverflowException is created and
+ // EXCEPTION is returned.
+ static Result MatchForCallFromRuntime(
+ Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
+ int* output_registers, int output_register_count, int start_position);
+
+ // In case a StackOverflow occurs, EXCEPTION is returned. The caller is
+ // responsible for creating the exception.
+ //
+ // RETRY is returned if a retry through the runtime is needed (e.g. when
+ // interrupts have been scheduled or the regexp is marked for tier-up).
+ //
+ // Arguments input_start and input_end are unused. They are only passed to
+ // match the signature of the native irregexp code.
+ //
+ // Arguments output_registers and output_register_count describe the results
+ // array, which will contain register values of all captures if SUCCESS is
+ // returned. For all other return codes, the results array remains unmodified.
+ static Result MatchForCallFromJs(Address subject, int32_t start_position,
+ Address input_start, Address input_end,
+ int* output_registers,
+ int32_t output_register_count,
+ RegExp::CallOrigin call_origin,
+ Isolate* isolate, Address regexp);
+
+ // Lower-level entry point that takes the bytecode, the total register count
+ // and the backtrack limit explicitly instead of reading them from a JSRegExp.
+ // subject_string is expected to be flat.
+ static Result MatchInternal(Isolate* isolate, ByteArray code_array,
+ String subject_string, int* output_registers,
+ int output_register_count,
+ int total_register_count, int start_position,
+ RegExp::CallOrigin call_origin,
+ uint32_t backtrack_limit);
+
+ private:
+ // Shared implementation behind the two MatchForCallFrom* entry points.
+ static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,
+ int* output_registers, int output_register_count,
+ int start_position, RegExp::CallOrigin call_origin);
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_INTERPRETER_H_
diff --git a/js/src/irregexp/imported/regexp-macro-assembler-arch.h b/js/src/irregexp/imported/regexp-macro-assembler-arch.h
new file mode 100644
index 0000000000..a755e7c1b3
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler-arch.h
@@ -0,0 +1,7 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "irregexp/RegExpNativeMacroAssembler.h"
diff --git a/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc b/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc
new file mode 100644
index 0000000000..6444ca3c60
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc
@@ -0,0 +1,438 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-macro-assembler-tracer.h"
+
+
+namespace v8 {
+namespace internal {
+
+// Wraps |assembler| (sharing its zone) and prints a header line naming the
+// implementation that is about to be traced.
+RegExpMacroAssemblerTracer::RegExpMacroAssemblerTracer(
+    Isolate* isolate, RegExpMacroAssembler* assembler)
+    : RegExpMacroAssembler(isolate, assembler->zone()), assembler_(assembler) {
+  const char* const implementation_name =
+      ImplementationToString(assembler->Implementation());
+  PrintF("RegExpMacroAssembler%s();\n", implementation_name);
+}
+
+// Defaulted: the tracer performs no cleanup of its own.
+RegExpMacroAssemblerTracer::~RegExpMacroAssemblerTracer() = default;
+
+// Logs that code generation was aborted, then notifies the wrapped assembler.
+void RegExpMacroAssemblerTracer::AbortedCodeGeneration() {
+ PrintF(" AbortedCodeGeneration\n");
+ assembler_->AbortedCodeGeneration();
+}
+
+
+// Debug-printing helper: turns the address of |label| into an int so that the
+// same label prints the same way each time. The address is narrowed to int, so
+// the value is only closely related to the address, not equal to it.
+static int LabelToInt(Label* label) {
+  const intptr_t address_bits = reinterpret_cast<intptr_t>(label);
+  return static_cast<int>(address_bits);
+}
+
+
+// Prints the label being bound, then binds it in the wrapped assembler.
+void RegExpMacroAssemblerTracer::Bind(Label* label) {
+  const int label_id = LabelToInt(label);
+  PrintF("label[%08x]: (Bind)\n", label_id);
+  assembler_->Bind(label);
+}
+
+
+// Logs the call with its |by| argument, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::AdvanceCurrentPosition(int by) {
+ PrintF(" AdvanceCurrentPosition(by=%d);\n", by);
+ assembler_->AdvanceCurrentPosition(by);
+}
+
+
+// Logs the call and its target label (followed by a blank line), then forwards
+// it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckGreedyLoop(Label* label) {
+  const int label_id = LabelToInt(label);
+  PrintF(" CheckGreedyLoop(label[%08x]);\n\n", label_id);
+  assembler_->CheckGreedyLoop(label);
+}
+
+
+// Logs the call, then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::PopCurrentPosition() {
+ PrintF(" PopCurrentPosition();\n");
+ assembler_->PopCurrentPosition();
+}
+
+
+// Logs the call, then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::PushCurrentPosition() {
+ PrintF(" PushCurrentPosition();\n");
+ assembler_->PushCurrentPosition();
+}
+
+
+// Logs the call, then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::Backtrack() {
+ PrintF(" Backtrack();\n");
+ assembler_->Backtrack();
+}
+
+
+// Logs the jump and its target label (followed by a blank line), then forwards
+// it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::GoTo(Label* label) {
+  const int label_id = LabelToInt(label);
+  PrintF(" GoTo(label[%08x]);\n\n", label_id);
+  assembler_->GoTo(label);
+}
+
+
+// Logs the call and the label being pushed, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::PushBacktrack(Label* label) {
+  const int label_id = LabelToInt(label);
+  PrintF(" PushBacktrack(label[%08x]);\n", label_id);
+  assembler_->PushBacktrack(label);
+}
+
+
+// Forwards Succeed() to the wrapped assembler first, then logs the call along
+// with a note when the wrapped assembler reported a restart. Returns the
+// wrapped assembler's result unchanged.
+bool RegExpMacroAssemblerTracer::Succeed() {
+  const bool restart = assembler_->Succeed();
+  const char* restart_note = "";
+  if (restart) restart_note = " [restart for global match]";
+  PrintF(" Succeed();%s\n", restart_note);
+  return restart;
+}
+
+
+// Logs the call, then forwards it to the wrapped assembler. The trace line is
+// newline-terminated like every other traced call, so the next entry starts on
+// its own line.
+void RegExpMacroAssemblerTracer::Fail() {
+  PrintF(" Fail();\n");
+  assembler_->Fail();
+}
+
+
+// Logs the call with the register index, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::PopRegister(int register_index) {
+ PrintF(" PopRegister(register=%d);\n", register_index);
+ assembler_->PopRegister(register_index);
+}
+
+
+// Logs the call with the register index and, when |check_stack_limit| is
+// truthy, a "check stack limit" note; then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::PushRegister(
+    int register_index, StackCheckFlag check_stack_limit) {
+  const char* const stack_check_note =
+      check_stack_limit ? "check stack limit" : "";
+  PrintF(" PushRegister(register=%d, %s);\n", register_index,
+         stack_check_note);
+  assembler_->PushRegister(register_index, check_stack_limit);
+}
+
+
+// Logs the call with the register and increment, then forwards it to the
+// wrapped assembler.
+void RegExpMacroAssemblerTracer::AdvanceRegister(int reg, int by) {
+ PrintF(" AdvanceRegister(register=%d, by=%d);\n", reg, by);
+ assembler_->AdvanceRegister(reg, by);
+}
+
+
+// Logs the call with its |by| argument, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) {
+ PrintF(" SetCurrentPositionFromEnd(by=%d);\n", by);
+ assembler_->SetCurrentPositionFromEnd(by);
+}
+
+
+// Logs the call with the register index and new value, then forwards it to
+// the wrapped assembler.
+void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) {
+ PrintF(" SetRegister(register=%d, to=%d);\n", register_index, to);
+ assembler_->SetRegister(register_index, to);
+}
+
+
+// Logs the call with the target register and cp_offset, then forwards it to
+// the wrapped assembler.
+void RegExpMacroAssemblerTracer::WriteCurrentPositionToRegister(int reg,
+ int cp_offset) {
+ PrintF(" WriteCurrentPositionToRegister(register=%d,cp_offset=%d);\n",
+ reg,
+ cp_offset);
+ assembler_->WriteCurrentPositionToRegister(reg, cp_offset);
+}
+
+
+// Logs the call with the register bounds it was given, then forwards it to
+// the wrapped assembler. The trace uses the method's actual name
+// ("ClearRegisters") so trace lines can be matched to the API by name.
+void RegExpMacroAssemblerTracer::ClearRegisters(int reg_from, int reg_to) {
+  PrintF(" ClearRegisters(from=%d, to=%d);\n", reg_from, reg_to);
+  assembler_->ClearRegisters(reg_from, reg_to);
+}
+
+
+// Logs the call with the source register, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::ReadCurrentPositionFromRegister(int reg) {
+ PrintF(" ReadCurrentPositionFromRegister(register=%d);\n", reg);
+ assembler_->ReadCurrentPositionFromRegister(reg);
+}
+
+
+// Logs the call with the target register, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::WriteStackPointerToRegister(int reg) {
+ PrintF(" WriteStackPointerToRegister(register=%d);\n", reg);
+ assembler_->WriteStackPointerToRegister(reg);
+}
+
+
+// Logs the call with the source register, then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::ReadStackPointerFromRegister(int reg) {
+ PrintF(" ReadStackPointerFromRegister(register=%d);\n", reg);
+ assembler_->ReadStackPointerFromRegister(reg);
+}
+
+// Logs a LoadCurrentCharacter request (offset, end-of-input label, whether the
+// load is bounds-checked, character count and eats_at_least), then forwards it
+// through the wrapped assembler's LoadCurrentCharacter entry point.
+void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl(
+    int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
+    int eats_at_least) {
+  const char* bounds_note = " (unchecked)";
+  if (check_bounds) bounds_note = "";
+  const int end_label_id = LabelToInt(on_end_of_input);
+  PrintF(
+      " LoadCurrentCharacter(cp_offset=%d, label[%08x]%s (%d chars) (eats at "
+      "least %d));\n",
+      cp_offset, end_label_id, bounds_note, characters, eats_at_least);
+  assembler_->LoadCurrentCharacter(cp_offset, on_end_of_input, check_bounds,
+                                   characters, eats_at_least);
+}
+
+namespace {
+
+// Formats a UTF-16 code unit for trace output: "(c)" when the unit is printable
+// ASCII (' ' through '~'), and the empty string otherwise. The pointer returned
+// by operator* refers to storage inside this object, so it is only usable
+// while the printer is alive.
+class PrintablePrinter {
+ public:
+  explicit PrintablePrinter(base::uc16 character) : character_(character) {}
+
+  const char* operator*() {
+    const bool is_printable_ascii = character_ >= ' ' && character_ <= '~';
+    if (!is_printable_ascii) {
+      buffer_[0] = '\0';
+      return &buffer_[0];
+    }
+    buffer_[0] = '(';
+    buffer_[1] = static_cast<char>(character_);
+    buffer_[2] = ')';
+    buffer_[3] = '\0';
+    return &buffer_[0];
+  }
+
+ private:
+  base::uc16 character_;
+  char buffer_[4];  // '(' + char + ')' + terminator.
+};
+
+}  // namespace
+
+// Logs the call (limit in hex plus its printable form, if any, and the target
+// label), then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckCharacterLT(base::uc16 limit,
+                                                  Label* on_less) {
+  PrintablePrinter limit_text(limit);
+  const int target_id = LabelToInt(on_less);
+  PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n", limit, *limit_text,
+         target_id);
+  assembler_->CheckCharacterLT(limit, on_less);
+}
+
+// Logs the call (limit in hex plus its printable form, if any, and the target
+// label), then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckCharacterGT(base::uc16 limit,
+                                                  Label* on_greater) {
+  PrintablePrinter limit_text(limit);
+  const int target_id = LabelToInt(on_greater);
+  PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n", limit, *limit_text,
+         target_id);
+  assembler_->CheckCharacterGT(limit, on_greater);
+}
+
+// Logs the call (character in hex plus its printable form, if any, and the
+// target label), then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) {
+  PrintablePrinter char_text(c);
+  const int target_id = LabelToInt(on_equal);
+  PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n", c, *char_text,
+         target_id);
+  assembler_->CheckCharacter(c, on_equal);
+}
+
+// Logs the call with its offset and target label, then forwards it to the
+// wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckAtStart(int cp_offset,
+                                              Label* on_at_start) {
+  const int target_id = LabelToInt(on_at_start);
+  PrintF(" CheckAtStart(cp_offset=%d, label[%08x]);\n", cp_offset, target_id);
+  assembler_->CheckAtStart(cp_offset, on_at_start);
+}
+
+// Logs the call with its offset and target label, then forwards it to the
+// wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckNotAtStart(int cp_offset,
+                                                 Label* on_not_at_start) {
+  const int target_id = LabelToInt(on_not_at_start);
+  PrintF(" CheckNotAtStart(cp_offset=%d, label[%08x]);\n", cp_offset,
+         target_id);
+  assembler_->CheckNotAtStart(cp_offset, on_not_at_start);
+}
+
+
+// Logs the call (character in hex plus its printable form, if any, and the
+// target label), then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckNotCharacter(unsigned c,
+                                                   Label* on_not_equal) {
+  PrintablePrinter char_text(c);
+  const int target_id = LabelToInt(on_not_equal);
+  PrintF(" CheckNotCharacter(c=0x%04x%s, label[%08x]);\n", c, *char_text,
+         target_id);
+  assembler_->CheckNotCharacter(c, on_not_equal);
+}
+
+
+// Logs the call (character, its printable form if any, mask and target label),
+// then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckCharacterAfterAnd(unsigned c,
+                                                        unsigned mask,
+                                                        Label* on_equal) {
+  PrintablePrinter char_text(c);
+  const int target_id = LabelToInt(on_equal);
+  PrintF(" CheckCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n", c,
+         *char_text, mask, target_id);
+  assembler_->CheckCharacterAfterAnd(c, mask, on_equal);
+}
+
+
+// Logs the call (character, its printable form if any, mask and target label),
+// then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd(
+    unsigned c, unsigned mask, Label* on_not_equal) {
+  PrintablePrinter char_text(c);
+  const int target_id = LabelToInt(on_not_equal);
+  PrintF(" CheckNotCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n",
+         c, *char_text, mask, target_id);
+  assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal);
+}
+
+// Logs the call (character, subtrahend, mask and target label), then forwards
+// it to the wrapped assembler. All three numeric fields are printed as
+// 0x-prefixed hex so |minus| cannot be misread as a decimal value.
+void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
+    base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) {
+  PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=0x%04x, mask=0x%04x, "
+         "label[%08x]);\n",
+         c, minus, mask, LabelToInt(on_not_equal));
+  assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal);
+}
+
+// Logs the call (both range bounds in hex with their printable forms, if any,
+// and the target label), then forwards it to the wrapped assembler.
+// The label parameter is named |on_in_range| to match the declaration in
+// regexp-macro-assembler-tracer.h.
+void RegExpMacroAssemblerTracer::CheckCharacterInRange(base::uc16 from,
+                                                       base::uc16 to,
+                                                       Label* on_in_range) {
+  PrintablePrinter printable_from(from);
+  PrintablePrinter printable_to(to);
+  PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n",
+         from, *printable_from, to, *printable_to, LabelToInt(on_in_range));
+  assembler_->CheckCharacterInRange(from, to, on_in_range);
+}
+
+// Logs the call (both range bounds in hex with their printable forms, if any,
+// and the target label), then forwards it to the wrapped assembler.
+// Both bounds are printed with a 0x prefix, matching CheckCharacterInRange,
+// and the label parameter is named |on_not_in_range| to match the declaration
+// in regexp-macro-assembler-tracer.h.
+void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
+    base::uc16 from, base::uc16 to, Label* on_not_in_range) {
+  PrintablePrinter printable_from(from);
+  PrintablePrinter printable_to(to);
+  PrintF(
+      " CheckCharacterNotInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n",
+      from, *printable_from, to, *printable_to, LabelToInt(on_not_in_range));
+  assembler_->CheckCharacterNotInRange(from, to, on_not_in_range);
+}
+
+namespace {
+
+// Prints one "[from=..., to=...]," line per range in |ranges|. Both bounds are
+// narrowed to base::uc16 and shown as 0x-prefixed hex, each followed by its
+// printable form when it has one.
+void PrintRangeArray(const ZoneList<CharacterRange>* ranges) {
+  for (int i = 0; i < ranges->length(); i++) {
+    base::uc16 from = ranges->at(i).from();
+    base::uc16 to = ranges->at(i).to();
+    PrintablePrinter printable_from(from);
+    PrintablePrinter printable_to(to);
+    PrintF(" [from=0x%04x%s, to=0x%04x%s],\n", from, *printable_from, to,
+           *printable_to);
+  }
+}
+
+}  // namespace
+
+// Logs the call and its target label, dumps every range in |ranges|, then
+// forwards the call and returns the wrapped assembler's result.
+bool RegExpMacroAssemblerTracer::CheckCharacterInRangeArray(
+    const ZoneList<CharacterRange>* ranges, Label* on_in_range) {
+  const int target_id = LabelToInt(on_in_range);
+  PrintF(
+      " CheckCharacterInRangeArray(\n"
+      " label[%08x]);\n",
+      target_id);
+  PrintRangeArray(ranges);
+  const bool result =
+      assembler_->CheckCharacterInRangeArray(ranges, on_in_range);
+  return result;
+}
+
+// Logs the call and its target label, dumps every range in |ranges|, then
+// forwards the call and returns the wrapped assembler's result.
+bool RegExpMacroAssemblerTracer::CheckCharacterNotInRangeArray(
+    const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) {
+  const int target_id = LabelToInt(on_not_in_range);
+  PrintF(
+      " CheckCharacterNotInRangeArray(\n"
+      " label[%08x]);\n",
+      target_id);
+  PrintRangeArray(ranges);
+  const bool result =
+      assembler_->CheckCharacterNotInRangeArray(ranges, on_not_in_range);
+  return result;
+}
+
+// Logs the call with its target label and a dump of |table|: one character per
+// entry ('X' for non-zero, '.' for zero), with a line break after every 32
+// entries except after the final one. Then forwards the call.
+void RegExpMacroAssemblerTracer::CheckBitInTable(Handle<ByteArray> table,
+                                                 Label* on_bit_set) {
+  PrintF(" CheckBitInTable(label[%08x] ", LabelToInt(on_bit_set));
+  for (int index = 0; index < kTableSize; index++) {
+    const char mark = table->get(index) != 0 ? 'X' : '.';
+    PrintF("%c", mark);
+    const bool ends_row = (index % 32) == 31;
+    const bool is_final_entry = index == kTableMask;
+    if (ends_row && !is_final_entry) {
+      PrintF("\n ");
+    }
+  }
+  PrintF(");\n");
+  assembler_->CheckBitInTable(table, on_bit_set);
+}
+
+
+// Logs the call (start register, read direction and target label), then
+// forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
+                                                       bool read_backward,
+                                                       Label* on_no_match) {
+  const char* const direction = read_backward ? "backward" : "forward";
+  const int target_id = LabelToInt(on_no_match);
+  PrintF(" CheckNotBackReference(register=%d, %s, label[%08x]);\n", start_reg,
+         direction, target_id);
+  assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match);
+}
+
+// Logs the call (start register, read direction, unicode mode and target
+// label), then forwards it to the wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
+    int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+  const char* const direction = read_backward ? "backward" : "forward";
+  const char* const mode = unicode ? "unicode" : "non-unicode";
+  const int target_id = LabelToInt(on_no_match);
+  PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
+         start_reg, direction, mode, target_id);
+  assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
+                                              on_no_match);
+}
+
+// Logs the call with its offset and target label, then forwards it to the
+// wrapped assembler.
+void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
+                                               Label* on_outside_input) {
+  const int target_id = LabelToInt(on_outside_input);
+  PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset, target_id);
+  assembler_->CheckPosition(cp_offset, on_outside_input);
+}
+
+// Forwards the call first, then logs it together with the boolean the wrapped
+// assembler returned. |type| is printed as a single character. Returns the
+// wrapped assembler's result unchanged.
+bool RegExpMacroAssemblerTracer::CheckSpecialClassRanges(
+    StandardCharacterSet type, Label* on_no_match) {
+  const bool supported = assembler_->CheckSpecialClassRanges(type, on_no_match);
+  const char type_char = static_cast<char>(type);
+  const char* const supported_text = supported ? "true" : "false";
+  PrintF(" CheckSpecialClassRanges(type='%c', label[%08x]): %s;\n", type_char,
+         LabelToInt(on_no_match), supported_text);
+  return supported;
+}
+
+// Logs the call (register, comparand and target label), then forwards it to
+// the wrapped assembler.
+void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index,
+                                              int comparand, Label* if_lt) {
+  const int target_id = LabelToInt(if_lt);
+  PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n",
+         register_index, comparand, target_id);
+  assembler_->IfRegisterLT(register_index, comparand, if_lt);
+}
+
+
+// Logs the call (register and target label), then forwards it to the wrapped
+// assembler.
+void RegExpMacroAssemblerTracer::IfRegisterEqPos(int register_index,
+                                                 Label* if_eq) {
+  const int target_id = LabelToInt(if_eq);
+  PrintF(" IfRegisterEqPos(register=%d, label[%08x]);\n", register_index,
+         target_id);
+  assembler_->IfRegisterEqPos(register_index, if_eq);
+}
+
+
+// Logs the call (register, comparand and target label), then forwards it to
+// the wrapped assembler.
+void RegExpMacroAssemblerTracer::IfRegisterGE(int register_index,
+                                              int comparand, Label* if_ge) {
+  const int target_id = LabelToInt(if_ge);
+  PrintF(" IfRegisterGE(register=%d, number=%d, label[%08x]);\n",
+         register_index, comparand, target_id);
+  assembler_->IfRegisterGE(register_index, comparand, if_ge);
+}
+
+
+// Not traced: reports the implementation kind of the wrapped assembler.
+RegExpMacroAssembler::IrregexpImplementation
+ RegExpMacroAssemblerTracer::Implementation() {
+ return assembler_->Implementation();
+}
+
+
+// Logs the call with the regexp source rendered as a C string, then returns
+// the code object produced by the wrapped assembler.
+Handle<HeapObject> RegExpMacroAssemblerTracer::GetCode(Handle<String> source) {
+ PrintF(" GetCode(%s);\n", source->ToCString().get());
+ return assembler_->GetCode(source);
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-macro-assembler-tracer.h b/js/src/irregexp/imported/regexp-macro-assembler-tracer.h
new file mode 100644
index 0000000000..3fadf1a893
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler-tracer.h
@@ -0,0 +1,90 @@
+// Copyright 2008 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
+#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
+
+#include "irregexp/imported/regexp-macro-assembler.h"
+
+namespace v8 {
+namespace internal {
+
+// Decorator on a RegExpMacroAssembler that writes all calls.
+// Every override delegates to the wrapped assembler; the out-of-line ones are
+// defined in regexp-macro-assembler-tracer.cc, where the printing happens.
+class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
+ public:
+ RegExpMacroAssemblerTracer(Isolate* isolate, RegExpMacroAssembler* assembler);
+ ~RegExpMacroAssemblerTracer() override;
+ void AbortedCodeGeneration() override;
+ // The two inline queries below delegate without printing anything.
+ int stack_limit_slack() override { return assembler_->stack_limit_slack(); }
+ bool CanReadUnaligned() const override {
+ return assembler_->CanReadUnaligned();
+ }
+ void AdvanceCurrentPosition(int by) override; // Signed cp change.
+ void AdvanceRegister(int reg, int by) override; // r[reg] += by.
+ void Backtrack() override;
+ void Bind(Label* label) override;
+ void CheckCharacter(unsigned c, Label* on_equal) override;
+ void CheckCharacterAfterAnd(unsigned c, unsigned and_with,
+ Label* on_equal) override;
+ void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
+ void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
+ void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
+ void CheckAtStart(int cp_offset, Label* on_at_start) override;
+ void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
+ void CheckNotBackReference(int start_reg, bool read_backward,
+ Label* on_no_match) override;
+ void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
+ bool unicode,
+ Label* on_no_match) override;
+ void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
+ void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
+ Label* on_not_equal) override;
+ void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
+ base::uc16 and_with,
+ Label* on_not_equal) override;
+ void CheckCharacterInRange(base::uc16 from, base::uc16 to,
+ Label* on_in_range) override;
+ void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
+ Label* on_not_in_range) override;
+ bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
+ Label* on_in_range) override;
+ bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
+ Label* on_not_in_range) override;
+ void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
+ void CheckPosition(int cp_offset, Label* on_outside_input) override;
+ bool CheckSpecialClassRanges(StandardCharacterSet type,
+ Label* on_no_match) override;
+ void Fail() override;
+ Handle<HeapObject> GetCode(Handle<String> source) override;
+ void GoTo(Label* label) override;
+ void IfRegisterGE(int reg, int comparand, Label* if_ge) override;
+ void IfRegisterLT(int reg, int comparand, Label* if_lt) override;
+ void IfRegisterEqPos(int reg, Label* if_eq) override;
+ IrregexpImplementation Implementation() override;
+ void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
+ bool check_bounds, int characters,
+ int eats_at_least) override;
+ void PopCurrentPosition() override;
+ void PopRegister(int register_index) override;
+ void PushBacktrack(Label* label) override;
+ void PushCurrentPosition() override;
+ void PushRegister(int register_index,
+ StackCheckFlag check_stack_limit) override;
+ void ReadCurrentPositionFromRegister(int reg) override;
+ void ReadStackPointerFromRegister(int reg) override;
+ void SetCurrentPositionFromEnd(int by) override;
+ void SetRegister(int register_index, int to) override;
+ bool Succeed() override;
+ void WriteCurrentPositionToRegister(int reg, int cp_offset) override;
+ void ClearRegisters(int reg_from, int reg_to) override;
+ void WriteStackPointerToRegister(int reg) override;
+
+ private:
+ // The wrapped assembler that every call is delegated to.
+ RegExpMacroAssembler* assembler_;
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
diff --git a/js/src/irregexp/imported/regexp-macro-assembler.cc b/js/src/irregexp/imported/regexp-macro-assembler.cc
new file mode 100644
index 0000000000..0592338229
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler.cc
@@ -0,0 +1,520 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-macro-assembler.h"
+
+#include "irregexp/imported/regexp-stack.h"
+#include "irregexp/imported/special-case.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uchar.h"
+#include "unicode/unistr.h"
+#endif // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+// Initial state: slow-safe compilation off, no backtrack limit, NOT_GLOBAL
+// mode; |isolate| and |zone| are stored as given.
+RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
+ : slow_safe_compiler_(false),
+ backtrack_limit_(JSRegExp::kNoBacktrackLimit),
+ global_mode_(NOT_GLOBAL),
+ isolate_(isolate),
+ zone_(zone) {}
+
+// True when a backtrack limit other than the kNoBacktrackLimit sentinel is set.
+bool RegExpMacroAssembler::has_backtrack_limit() const {
+ return backtrack_limit_ != JSRegExp::kNoBacktrackLimit;
+}
+
+// static
+// Compares two UTF-16 buffers of |byte_length| bytes each, code unit by code
+// unit, after canonicalizing both sides with RegExpCaseFolding::Canonicalize.
+// Returns 1 when all units match and 0 at the first mismatch. Builds without
+// V8_INTL_SUPPORT delegate to CaseInsensitiveCompareUnicode instead.
+int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
+                                                           Address byte_offset2,
+                                                           size_t byte_length,
+                                                           Isolate* isolate) {
+#ifdef V8_INTL_SUPPORT
+  // This function is not allowed to cause a garbage collection.
+  // A GC might move the calling generated code and invalidate the
+  // return address on the stack.
+  DisallowGarbageCollection no_gc;
+  DCHECK_EQ(0, byte_length % 2);
+  const size_t code_unit_count = byte_length / 2;
+  base::uc16* lhs = reinterpret_cast<base::uc16*>(byte_offset1);
+  base::uc16* rhs = reinterpret_cast<base::uc16*>(byte_offset2);
+
+  size_t index = 0;
+  while (index < code_unit_count) {
+    const UChar32 folded_lhs = RegExpCaseFolding::Canonicalize(lhs[index]);
+    const UChar32 folded_rhs = RegExpCaseFolding::Canonicalize(rhs[index]);
+    if (folded_lhs != folded_rhs) return 0;
+    ++index;
+  }
+  return 1;
+#else
+  return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length,
+                                       isolate);
+#endif
+}
+
+// static
+// Compares two UTF-16 buffers of |byte_length| bytes each, ignoring case.
+// Returns 1 when they are equal under case folding and 0 otherwise.
+// With V8_INTL_SUPPORT the comparison is done by ICU's caseCompare; otherwise
+// each differing pair of code units is compared after canonicalization through
+// the isolate's Ecma262Canonicalize mapping.
+int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
+                                                        Address byte_offset2,
+                                                        size_t byte_length,
+                                                        Isolate* isolate) {
+  // This function is not allowed to cause a garbage collection.
+  // A GC might move the calling generated code and invalidate the
+  // return address on the stack.
+  DisallowGarbageCollection no_gc;
+  DCHECK_EQ(0, byte_length % 2);
+
+#ifdef V8_INTL_SUPPORT
+  const int32_t code_unit_count = static_cast<int32_t>(byte_length >> 1);
+  const char16_t* lhs = reinterpret_cast<const char16_t*>(byte_offset1);
+  const char16_t* rhs = reinterpret_cast<const char16_t*>(byte_offset2);
+  icu::UnicodeString lhs_string(lhs, code_unit_count);
+  const bool equal_ignoring_case =
+      lhs_string.caseCompare(rhs, code_unit_count, U_FOLD_CASE_DEFAULT) == 0;
+  return equal_ignoring_case;
+#else
+  base::uc16* lhs = reinterpret_cast<base::uc16*>(byte_offset1);
+  base::uc16* rhs = reinterpret_cast<base::uc16*>(byte_offset2);
+  const size_t code_unit_count = byte_length >> 1;
+  DCHECK_NOT_NULL(isolate);
+  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+      isolate->regexp_macro_assembler_canonicalize();
+  for (size_t index = 0; index < code_unit_count; index++) {
+    unibrow::uchar lhs_unit = lhs[index];
+    unibrow::uchar rhs_unit = rhs[index];
+    if (lhs_unit == rhs_unit) continue;
+
+    // The units differ as written: canonicalize the left side first and only
+    // canonicalize the right side if that alone does not make them equal.
+    unibrow::uchar lhs_folded[1] = {lhs_unit};
+    canonicalize->get(lhs_unit, '\0', lhs_folded);
+    if (lhs_folded[0] == rhs_unit) continue;
+
+    unibrow::uchar rhs_folded[1] = {rhs_unit};
+    canonicalize->get(rhs_unit, '\0', rhs_folded);
+    if (lhs_folded[0] != rhs_folded[0]) {
+      return 0;
+    }
+  }
+  return 1;
+#endif  // V8_INTL_SUPPORT
+}
+
+namespace {
+
+// Folds the from/to bounds of every range, in order, into one hash value and
+// returns it narrowed to 32 bits.
+uint32_t Hash(const ZoneList<CharacterRange>* ranges) {
+  size_t combined = 0;
+  const int range_count = ranges->length();
+  for (int index = 0; index < range_count; ++index) {
+    const CharacterRange& range = ranges->at(index);
+    combined = base::hash_combine(combined, range.from(), range.to());
+  }
+  return static_cast<uint32_t>(combined);
+}
+
+// Returns the low 16 bits of |c|. Debug builds assert that any value above
+// kMaxUInt16 is exactly String::kMaxCodePoint.
+constexpr base::uc32 MaskEndOfRangeMarker(base::uc32 c) {
+ // CharacterRanges may use 0x10ffff as the end-of-range marker irrespective
+ // of whether the regexp IsUnicode or not; translate the marker value here.
+ DCHECK_IMPLIES(c > kMaxUInt16, c == String::kMaxCodePoint);
+ return c & 0xffff;
+}
+
+// Number of uint16 entries needed to encode |ranges| as boundaries: two per
+// range, minus one when the last range ends at kMaxUInt16 (after masking) and
+// is therefore stored without an end. Reads the last element, so |ranges| is
+// expected to be non-empty.
+int RangeArrayLengthFor(const ZoneList<CharacterRange>* ranges) {
+  const int ranges_length = ranges->length();
+  const base::uc32 last_to =
+      MaskEndOfRangeMarker(ranges->at(ranges_length - 1).to());
+  const bool last_range_is_open_ended = last_to == kMaxUInt16;
+  if (last_range_is_open_ended) return ranges_length * 2 - 1;
+  return ranges_length * 2;
+}
+
+// Returns true when the encoded boundary array |rhs| describes exactly the
+// ranges in |lhs|: same encoded length, each start equal to from(), and each
+// stored end equal to to() + 1. A missing final end (open-ended last range) is
+// accepted.
+bool Equals(const ZoneList<CharacterRange>* lhs,
+            const Handle<FixedUInt16Array>& rhs) {
+  const int encoded_length = rhs->length();
+  if (encoded_length != RangeArrayLengthFor(lhs)) return false;
+
+  const int range_count = lhs->length();
+  for (int index = 0; index < range_count; ++index) {
+    const CharacterRange& range = lhs->at(index);
+    const int from_slot = index * 2 + 0;
+    const int to_slot = index * 2 + 1;
+    if (rhs->get(from_slot) != range.from()) return false;
+    if (to_slot == encoded_length) break;  // Open-ended last range.
+    if (rhs->get(to_slot) != range.to() + 1) return false;
+  }
+  return true;
+}
+
+// Encodes |ranges| into a newly created FixedUInt16Array of boundaries: each
+// range contributes its inclusive start followed by its exclusive end. A last
+// range that ends at kMaxUInt16 stores only its start, because the exclusive
+// end would not fit in 16 bits.
+Handle<FixedUInt16Array> MakeRangeArray(
+    Isolate* isolate, const ZoneList<CharacterRange>* ranges) {
+  const int range_count = ranges->length();
+  const int encoded_length = RangeArrayLengthFor(ranges);
+  Handle<FixedUInt16Array> encoded =
+      FixedUInt16Array::New(isolate, encoded_length);
+  for (int index = 0; index < range_count; ++index) {
+    const CharacterRange& range = ranges->at(index);
+    DCHECK_LE(range.from(), kMaxUInt16);
+    encoded->set(index * 2 + 0, range.from());
+    const base::uc32 inclusive_end = MaskEndOfRangeMarker(range.to());
+    const bool is_last_range = index == range_count - 1;
+    if (is_last_range && inclusive_end == kMaxUInt16) {
+      DCHECK_EQ(encoded_length, range_count * 2 - 1);
+      break;  // Avoid overflow by leaving the last range open-ended.
+    }
+    DCHECK_LT(inclusive_end, kMaxUInt16);
+    encoded->set(index * 2 + 1, inclusive_end + 1);  // Exclusive.
+  }
+  return encoded;
+}
+
+} // namespace
+
+// Returns an encoded range array for |ranges|, reusing the cached array stored
+// under the same hash when it encodes exactly these ranges. Otherwise a new
+// array is built and stored under that hash, replacing any previous entry.
+Handle<ByteArray> NativeRegExpMacroAssembler::GetOrAddRangeArray(
+    const ZoneList<CharacterRange>* ranges) {
+  const uint32_t hash = Hash(ranges);
+
+  auto cached = range_array_cache_.find(hash);
+  if (cached != range_array_cache_.end()) {
+    Handle<FixedUInt16Array> candidate = cached->second;
+    if (Equals(ranges, candidate)) return candidate;
+  }
+
+  Handle<FixedUInt16Array> fresh = MakeRangeArray(isolate(), ranges);
+  range_array_cache_[hash] = fresh;
+  return fresh;
+}
+
+// static
+// Tests whether |current_char| lies inside one of the ranges encoded in the
+// FixedUInt16Array at |raw_byte_array|. Returns 1 (kTrue) if it does and
+// 0 (kFalse) otherwise. |isolate| is not used in this body.
+uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char,
+ Address raw_byte_array,
+ Isolate* isolate) {
+ // Use uint32_t to avoid complexity around bool return types (which may be
+ // optimized to use only the least significant byte).
+ static constexpr uint32_t kTrue = 1;
+ static constexpr uint32_t kFalse = 0;
+
+ FixedUInt16Array ranges = FixedUInt16Array::cast(Object(raw_byte_array));
+ DCHECK_GE(ranges.length(), 1);
+
+ // Shortcut for fully out of range chars.
+ if (current_char < ranges.get(0)) return kFalse;
+ if (current_char >= ranges.get(ranges.length() - 1)) {
+ // The last range may be open-ended.
+ // An odd length means the final element is a range start with no end.
+ return (ranges.length() % 2) == 0 ? kFalse : kTrue;
+ }
+
+ // Binary search for the matching range. `ranges` is encoded as
+ // [from0, to0, from1, to1, ..., fromN, toN], or
+ // [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
+
+ int mid, lower = 0, upper = ranges.length();
+ do {
+ mid = lower + (upper - lower) / 2;
+ const base::uc16 elem = ranges.get(mid);
+ if (current_char < elem) {
+ upper = mid;
+ } else if (current_char > elem) {
+ lower = mid + 1;
+ } else {
+ DCHECK_EQ(current_char, elem);
+ break;
+ }
+ } while (lower < upper);
+
+ // |mid| is the last index probed. The boundary at or below |current_char| is
+ // |mid| itself when the char is >= that element, otherwise the one before it.
+ const bool current_char_ge_last_elem = current_char >= ranges.get(mid);
+ const int current_range_start_index =
+ current_char_ge_last_elem ? mid : mid - 1;
+
+ // Ranges start at even indices and end at odd indices.
+ return (current_range_start_index % 2) == 0 ? kTrue : kFalse;
+}
+
+// Emits a check that branches to |on_failure| when the character at
+// |cp_offset| is a trail surrogate and the character before it is a lead
+// surrogate. In every other case, including when either load runs off the
+// input, control continues at the local |ok| label bound at the end.
+void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
+ Label* on_failure) {
+ Label ok;
+ // Check that current character is not a trail surrogate.
+ LoadCurrentCharacter(cp_offset, &ok);
+ CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
+ // Check that previous character is not a lead surrogate.
+ LoadCurrentCharacter(cp_offset - 1, &ok);
+ CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
+ Bind(&ok);
+}
+
+// Default implementation: performs a bounds-checked LoadCurrentCharacter at
+// |cp_offset|, using |on_outside_input| as the end-of-input label.
+void RegExpMacroAssembler::CheckPosition(int cp_offset,
+ Label* on_outside_input) {
+ LoadCurrentCharacter(cp_offset, on_outside_input, true);
+}
+
+// Public entry point for loading the current character(s). Resolves the
+// kUseCharactersValue placeholder for |eats_at_least| to |characters| and then
+// hands off to the implementation-specific LoadCurrentCharacterImpl.
+void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
+                                                Label* on_end_of_input,
+                                                bool check_bounds,
+                                                int characters,
+                                                int eats_at_least) {
+  // By default, eats_at_least = characters.
+  const int resolved_eats_at_least =
+      eats_at_least == kUseCharactersValue ? characters : eats_at_least;
+
+  LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
+                           resolved_eats_at_least);
+}
+
+// Native implementation of the character load. When |check_bounds| is set, it
+// first emits a position check: for non-negative offsets at the last position
+// the success path is known to consume (cp_offset + eats_at_least - 1), and
+// for negative offsets at |cp_offset| itself. It then emits the unchecked load.
+void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
+    int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
+    int eats_at_least) {
+  // It's possible to preload a small number of characters when each success
+  // path requires a large number of characters, but not the reverse.
+  DCHECK_GE(eats_at_least, characters);
+
+  DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
+  if (check_bounds) {
+    const int position_to_check =
+        cp_offset >= 0 ? cp_offset + eats_at_least - 1 : cp_offset;
+    CheckPosition(position_to_check, on_end_of_input);
+  }
+  LoadCurrentCharacterUnchecked(cp_offset, characters);
+}
+
+// Unaligned reads are allowed only when the enable_regexp_unaligned_accesses
+// flag is set and the assembler is not in slow-safe mode.
+bool NativeRegExpMacroAssembler::CanReadUnaligned() const {
+ return v8_flags.enable_regexp_unaligned_accesses && !slow_safe();
+}
+
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
+// This method may only be called after an interrupt.
+// static
+// Returns EXCEPTION, RETRY, or 0 (continue execution).
+// - Calls originating from JS never run interrupts here: EXCEPTION on JS stack
+// overflow, RETRY when an interrupt is requested, otherwise 0.
+// - Calls originating from the runtime raise the stack overflow or handle the
+// pending interrupts (GC allowed while doing so), patch |return_address| if
+// the code object moved, and then either refresh |subject|, |input_start|
+// and |input_end|, or return RETRY if the subject's one-byte/two-byte
+// representation changed.
+int NativeRegExpMacroAssembler::CheckStackGuardState(
+ Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
+ Address* return_address, InstructionStream re_code, Address* subject,
+ const byte** input_start, const byte** input_end) {
+ DisallowGarbageCollection no_gc;
+ Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
+ DCHECK_LE(re_code.instruction_start(), old_pc);
+ DCHECK_LE(old_pc, re_code.code(kAcquireLoad).instruction_end());
+
+ StackLimitCheck check(isolate);
+ bool js_has_overflowed = check.JsHasOverflowed();
+
+ if (call_origin == RegExp::CallOrigin::kFromJs) {
+ // Direct calls from JavaScript can be interrupted in two ways:
+ // 1. A real stack overflow, in which case we let the caller throw the
+ // exception.
+ // 2. The stack guard was used to interrupt execution for another purpose,
+ // forcing the call through the runtime system.
+
+ // Bug(v8:9540) Investigate why this method is called from JS although no
+ // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
+ // to continue execution normally.
+ if (js_has_overflowed) {
+ return EXCEPTION;
+ } else if (check.InterruptRequested()) {
+ return RETRY;
+ } else {
+ return 0;
+ }
+ }
+ DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
+
+ // Prepare for possible GC.
+ // Handles keep the code object and subject reachable, and let the code below
+ // detect whether they moved.
+ HandleScope handles(isolate);
+ Handle<InstructionStream> code_handle(re_code, isolate);
+ Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
+ bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
+ int return_value = 0;
+
+ {
+ DisableGCMole no_gc_mole;
+ if (js_has_overflowed) {
+ AllowGarbageCollection yes_gc;
+ isolate->StackOverflow();
+ return_value = EXCEPTION;
+ } else if (check.InterruptRequested()) {
+ AllowGarbageCollection yes_gc;
+ Object result = isolate->stack_guard()->HandleInterrupts();
+ if (result.IsException(isolate)) return_value = EXCEPTION;
+ }
+
+ // We are not using operator == here because it does a slow DCHECK
+ // CheckObjectComparisonAllowed() which might crash when trying to access
+ // the page header of the stale pointer.
+ if (!code_handle->SafeEquals(re_code)) { // Return address no longer valid
+ // Overwrite the return address on the stack.
+ // The new pc is the old pc shifted by the distance the code object moved.
+ intptr_t delta = code_handle->address() - re_code.address();
+ Address new_pc = old_pc + delta;
+ // TODO(v8:10026): avoid replacing a signed pointer.
+ PointerAuthentication::ReplacePC(return_address, new_pc, 0);
+ }
+ }
+
+ // If we continue, we need to update the subject string addresses.
+ if (return_value == 0) {
+ // String encoding might have changed.
+ if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
+ is_one_byte) {
+ // If we changed between an LATIN1 and an UC16 string, the specialized
+ // code cannot be used, and we need to restart regexp matching from
+ // scratch (including, potentially, compiling a new version of the code).
+ return_value = RETRY;
+ } else {
+ // Same encoding: keep the input's byte length and rebase both pointers on
+ // the subject's current address.
+ *subject = subject_handle->ptr();
+ intptr_t byte_length = *input_end - *input_start;
+ *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
+ *input_end = *input_start + byte_length;
+ }
+ }
+ return return_value;
+}
+
+// Returns a {Result} sentinel, or the number of successful matches.
+// Expects a flat |subject| and 0 <= previous_index <= subject length (both
+// DCHECKed). Unwraps cons, sliced and thin strings to find the underlying
+// character storage, computes the [input_start, input_end) byte range that
+// starts at |previous_index|, and runs the compiled code through Execute().
+int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
+ Handle<String> subject,
+ int* offsets_vector,
+ int offsets_vector_length,
+ int previous_index, Isolate* isolate) {
+ DCHECK(subject->IsFlat());
+ DCHECK_LE(0, previous_index);
+ DCHECK_LE(previous_index, subject->length());
+
+ // No allocations before calling the regexp, but we can't use
+ // DisallowGarbageCollection, since regexps might be preempted, and another
+ // thread might do allocation anyway.
+
+ String subject_ptr = *subject;
+ // Character offsets into string.
+ int start_offset = previous_index;
+ int char_length = subject_ptr.length() - start_offset;
+ int slice_offset = 0;
+
+ // The string has been flattened, so if it is a cons string it contains the
+ // full string in the first part.
+ if (StringShape(subject_ptr).IsCons()) {
+ DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
+ subject_ptr = ConsString::cast(subject_ptr).first();
+ } else if (StringShape(subject_ptr).IsSliced()) {
+ SlicedString slice = SlicedString::cast(subject_ptr);
+ subject_ptr = slice.parent();
+ slice_offset = slice.offset();
+ }
+ if (StringShape(subject_ptr).IsThin()) {
+ subject_ptr = ThinString::cast(subject_ptr).actual();
+ }
+ // Ensure that an underlying string has the same representation.
+ bool is_one_byte = subject_ptr.IsOneByteRepresentation();
+ DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
+ // String is now either Sequential or External
+ // Shift that converts a character count to a byte count: 0 for one-byte
+ // strings, 1 for two-byte strings.
+ int char_size_shift = is_one_byte ? 0 : 1;
+
+ DisallowGarbageCollection no_gc;
+ const byte* input_start =
+ subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
+ int byte_length = char_length << char_size_shift;
+ const byte* input_end = input_start + byte_length;
+ return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
+ offsets_vector_length, isolate, *regexp);
+}
+
+// static
+int NativeRegExpMacroAssembler::ExecuteForTesting(
+ String input, int start_offset, const byte* input_start,
+ const byte* input_end, int* output, int output_size, Isolate* isolate,
+ JSRegExp regexp) {
+ return Execute(input, start_offset, input_start, input_end, output,
+ output_size, isolate, regexp);
+}
+
+// Returns a {Result} sentinel, or the number of successful matches.
+// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
+// the signature of the interpreter. We should get rid of JS objects passed to
+// internal methods.
+int NativeRegExpMacroAssembler::Execute(
+ String input, // This needs to be the unpacked (sliced, cons) string.
+ int start_offset, const byte* input_start, const byte* input_end,
+ int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
+ RegExpStackScope stack_scope(isolate);
+
+ bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
+ Code code = Code::cast(regexp.code(is_one_byte));
+ RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
+
+ using RegexpMatcherSig =
+ // NOLINTNEXTLINE(readability/casting)
+ int(Address input_string, int start_offset, const byte* input_start,
+ const byte* input_end, int* output, int output_size, int call_origin,
+ Isolate* isolate, Address regexp);
+
+ auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(isolate, code);
+ int result = fn.Call(input.ptr(), start_offset, input_start, input_end,
+ output, output_size, call_origin, isolate, regexp.ptr());
+ DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
+
+ if (result == EXCEPTION && !isolate->has_pending_exception()) {
+ // We detected a stack overflow (on the backtrack stack) in RegExp code,
+ // but haven't created the exception yet. Additionally, we allow heap
+ // allocation because even though it invalidates {input_start} and
+ // {input_end}, we are about to return anyway.
+ AllowGarbageCollection allow_allocation;
+ isolate->StackOverflow();
+ }
+ return result;
+}
+
+#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
+// clang-format off
+const byte NativeRegExpMacroAssembler::word_character_map[] = {
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
+ 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
+
+ 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
+ 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
+ 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
+ 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
+
+ 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
+ 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
+ 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
+ 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
+ // Latin-1 range
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+};
+// clang-format on
+
+// static
+Address NativeRegExpMacroAssembler::GrowStack(Isolate* isolate) {
+ DisallowGarbageCollection no_gc;
+
+ RegExpStack* regexp_stack = isolate->regexp_stack();
+ const size_t old_size = regexp_stack->memory_size();
+
+#ifdef DEBUG
+ const Address old_stack_top = regexp_stack->memory_top();
+ const Address old_stack_pointer = regexp_stack->stack_pointer();
+ CHECK_LE(old_stack_pointer, old_stack_top);
+ CHECK_LE(static_cast<size_t>(old_stack_top - old_stack_pointer), old_size);
+#endif // DEBUG
+
+ Address new_stack_base = regexp_stack->EnsureCapacity(old_size * 2);
+ if (new_stack_base == kNullAddress) return kNullAddress;
+
+ return regexp_stack->stack_pointer();
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-macro-assembler.h b/js/src/irregexp/imported/regexp-macro-assembler.h
new file mode 100644
index 0000000000..651f6cb580
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler.h
@@ -0,0 +1,361 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
+#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
+
+#include "irregexp/imported/regexp-ast.h"
+#include "irregexp/imported/regexp.h"
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+class ByteArray;
+class JSRegExp;
+class Label;
+class String;
+
+static const base::uc32 kLeadSurrogateStart = 0xd800;
+static const base::uc32 kLeadSurrogateEnd = 0xdbff;
+static const base::uc32 kTrailSurrogateStart = 0xdc00;
+static const base::uc32 kTrailSurrogateEnd = 0xdfff;
+static const base::uc32 kNonBmpStart = 0x10000;
+static const base::uc32 kNonBmpEnd = 0x10ffff;
+
+class RegExpMacroAssembler {
+ public:
+ // The implementation must be able to handle at least:
+ static constexpr int kMaxRegisterCount = (1 << 16);
+ static constexpr int kMaxRegister = kMaxRegisterCount - 1;
+ static constexpr int kMaxCaptures = (kMaxRegister - 1) / 2;
+ static constexpr int kMaxCPOffset = (1 << 15) - 1;
+ static constexpr int kMinCPOffset = -(1 << 15);
+
+ static constexpr int kTableSizeBits = 7;
+ static constexpr int kTableSize = 1 << kTableSizeBits;
+ static constexpr int kTableMask = kTableSize - 1;
+
+ static constexpr int kUseCharactersValue = -1;
+
+ RegExpMacroAssembler(Isolate* isolate, Zone* zone);
+ virtual ~RegExpMacroAssembler() = default;
+
+ virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
+
+ // This function is called when code generation is aborted, so that
+ // the assembler could clean up internal data structures.
+ virtual void AbortedCodeGeneration() {}
+ // The maximal number of pushes between stack checks. Users must supply the
+ // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
+ // at least once for every stack_limit_slack() pushes that are executed.
+ virtual int stack_limit_slack() = 0;
+ virtual bool CanReadUnaligned() const = 0;
+
+ virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
+ virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
+ // Continues execution from the position pushed on the top of the backtrack
+ // stack by an earlier PushBacktrack(Label*).
+ virtual void Backtrack() = 0;
+ virtual void Bind(Label* label) = 0;
+ // Check the current character for a match with the literal character c
+ // and go to the on_equal label if it matches.
+ virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
+ // Bitwise and the current character with the given constant and then
+ // check for a match with c.
+ virtual void CheckCharacterAfterAnd(unsigned c,
+ unsigned and_with,
+ Label* on_equal) = 0;
+ virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0;
+ virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0;
+ virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
+ virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
+ virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
+ virtual void CheckNotBackReference(int start_reg, bool read_backward,
+ Label* on_no_match) = 0;
+ virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
+ bool read_backward, bool unicode,
+ Label* on_no_match) = 0;
+ // Check the current character for a match with a literal character. If we
+ // fail to match then go to the on_not_equal label. End of input always
+ // matches. If the label is nullptr then we should pop a backtrack address
+ // off the stack and go to that.
+ virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
+ virtual void CheckNotCharacterAfterAnd(unsigned c,
+ unsigned and_with,
+ Label* on_not_equal) = 0;
+ // Subtract a constant from the current character, then and with the given
+ // constant and then check for a match with c.
+ virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
+ base::uc16 and_with,
+ Label* on_not_equal) = 0;
+ virtual void CheckCharacterInRange(base::uc16 from,
+ base::uc16 to, // Both inclusive.
+ Label* on_in_range) = 0;
+ virtual void CheckCharacterNotInRange(base::uc16 from,
+ base::uc16 to, // Both inclusive.
+ Label* on_not_in_range) = 0;
+ // Returns true if the check was emitted, false otherwise.
+ virtual bool CheckCharacterInRangeArray(
+ const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0;
+ virtual bool CheckCharacterNotInRangeArray(
+ const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0;
+
+ // The current character (modulo kTableSize) is looked up in the byte
+ // array, and if the found byte is non-zero, we jump to the on_bit_set label.
+ virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
+
+ // Checks whether the given offset from the current position is before
+ // the end of the string. May overwrite the current character.
+ virtual void CheckPosition(int cp_offset, Label* on_outside_input);
+ // Check whether a standard/default character class matches the current
+ // character. Returns false if the type of special character class does
+ // not have custom support.
+ // May clobber the current loaded character.
+ virtual bool CheckSpecialClassRanges(StandardCharacterSet type,
+ Label* on_no_match) {
+ return false;
+ }
+
+ // Control-flow integrity:
+ // Define a jump target and bind a label.
+ virtual void BindJumpTarget(Label* label) { Bind(label); }
+
+ virtual void Fail() = 0;
+ virtual void GoTo(Label* label) = 0;
+ // Check whether a register is >= a given constant and go to a label if it
+ // is. Backtracks instead if the label is nullptr.
+ virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
+ // Check whether a register is < a given constant and go to a label if it is.
+ // Backtracks instead if the label is nullptr.
+ virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
+ // Check whether a register is equal to the current position and go to a
+ // label if it is.
+ virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
+ V8_EXPORT_PRIVATE void LoadCurrentCharacter(
+ int cp_offset, Label* on_end_of_input, bool check_bounds = true,
+ int characters = 1, int eats_at_least = kUseCharactersValue);
+ virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
+ bool check_bounds, int characters,
+ int eats_at_least) = 0;
+ virtual void PopCurrentPosition() = 0;
+ virtual void PopRegister(int register_index) = 0;
+ // Pushes the label on the backtrack stack, so that a following Backtrack
+ // will go to this label. Always checks the backtrack stack limit.
+ virtual void PushBacktrack(Label* label) = 0;
+ virtual void PushCurrentPosition() = 0;
+ enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true };
+ virtual void PushRegister(int register_index,
+ StackCheckFlag check_stack_limit) = 0;
+ virtual void ReadCurrentPositionFromRegister(int reg) = 0;
+ virtual void ReadStackPointerFromRegister(int reg) = 0;
+ virtual void SetCurrentPositionFromEnd(int by) = 0;
+ virtual void SetRegister(int register_index, int to) = 0;
+ // Return whether the matching (with a global regexp) will be restarted.
+ virtual bool Succeed() = 0;
+ virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
+ virtual void ClearRegisters(int reg_from, int reg_to) = 0;
+ virtual void WriteStackPointerToRegister(int reg) = 0;
+
+ // Check that we are not in the middle of a surrogate pair.
+ void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
+
+#define IMPLEMENTATIONS_LIST(V) \
+ V(IA32) \
+ V(ARM) \
+ V(ARM64) \
+ V(MIPS) \
+ V(LOONG64) \
+ V(RISCV) \
+ V(RISCV32) \
+ V(S390) \
+ V(PPC) \
+ V(X64) \
+ V(Bytecode)
+
+ enum IrregexpImplementation {
+#define V(Name) k##Name##Implementation,
+ IMPLEMENTATIONS_LIST(V)
+#undef V
+ };
+
+ inline const char* ImplementationToString(IrregexpImplementation impl) {
+ static const char* const kNames[] = {
+#define V(Name) #Name,
+ IMPLEMENTATIONS_LIST(V)
+#undef V
+ };
+ return kNames[impl];
+ }
+#undef IMPLEMENTATIONS_LIST
+ virtual IrregexpImplementation Implementation() = 0;
+
+ // Compare two-byte strings case insensitively.
+ //
+ // Called from generated code.
+ static int CaseInsensitiveCompareNonUnicode(Address byte_offset1,
+ Address byte_offset2,
+ size_t byte_length,
+ Isolate* isolate);
+ static int CaseInsensitiveCompareUnicode(Address byte_offset1,
+ Address byte_offset2,
+ size_t byte_length,
+ Isolate* isolate);
+
+ // `raw_byte_array` is a ByteArray containing a set of character ranges,
+ // where ranges are encoded as uint16_t elements:
+ //
+ // [from0, to0, from1, to1, ..., fromN, toN], or
+ // [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
+ //
+ // fromN is inclusive, toN is exclusive. Returns zero if not in a range,
+ // non-zero otherwise.
+ //
+ // Called from generated code.
+ static uint32_t IsCharacterInRangeArray(uint32_t current_char,
+ Address raw_byte_array,
+ Isolate* isolate);
+
+ // Controls the generation of large inlined constants in the code.
+ void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
+ bool slow_safe() const { return slow_safe_compiler_; }
+
+ // Controls after how many backtracks irregexp should abort execution. If it
+ // can fall back to the experimental engine (see `set_can_fallback`), it will
+ // return the appropriate error code, otherwise it will return the number of
+ // matches found so far (perhaps none).
+ void set_backtrack_limit(uint32_t backtrack_limit) {
+ backtrack_limit_ = backtrack_limit;
+ }
+
+ // Set whether or not irregexp can fall back to the experimental engine on
+ // excessive backtracking. The number of backtracks considered excessive can
+ // be controlled with set_backtrack_limit.
+ void set_can_fallback(bool val) { can_fallback_ = val; }
+
+ enum GlobalMode {
+ NOT_GLOBAL,
+ GLOBAL_NO_ZERO_LENGTH_CHECK,
+ GLOBAL,
+ GLOBAL_UNICODE
+ };
+ // Set whether the regular expression has the global flag. Exiting due to
+ // a failure in a global regexp may still mean success overall.
+ inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
+ inline bool global() const { return global_mode_ != NOT_GLOBAL; }
+ inline bool global_with_zero_length_check() const {
+ return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
+ }
+ inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; }
+
+ Isolate* isolate() const { return isolate_; }
+ Zone* zone() const { return zone_; }
+
+ protected:
+ bool has_backtrack_limit() const;
+ uint32_t backtrack_limit() const { return backtrack_limit_; }
+
+ bool can_fallback() const { return can_fallback_; }
+
+ private:
+ bool slow_safe_compiler_;
+ uint32_t backtrack_limit_;
+ bool can_fallback_ = false;
+ GlobalMode global_mode_;
+ Isolate* const isolate_;
+ Zone* const zone_;
+};
+
+class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
+ public:
+ // Type of input string to generate code for.
+ enum Mode { LATIN1 = 1, UC16 = 2 };
+
+ // Result of calling generated native RegExp code.
+ // RETRY: Something significant changed during execution, and the matching
+ // should be retried from scratch.
+ // EXCEPTION: Something failed during execution. If no exception has been
+ // thrown, it's an internal out-of-memory, and the caller should
+ // throw the exception.
+ // FAILURE: Matching failed.
+ // SUCCESS: Matching succeeded, and the output array has been filled with
+ // capture positions.
+ // FALLBACK_TO_EXPERIMENTAL: Execute the regexp on this subject using the
+ // experimental engine instead.
+ enum Result {
+ FAILURE = RegExp::kInternalRegExpFailure,
+ SUCCESS = RegExp::kInternalRegExpSuccess,
+ EXCEPTION = RegExp::kInternalRegExpException,
+ RETRY = RegExp::kInternalRegExpRetry,
+ FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental,
+ SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult,
+ };
+
+ NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone)
+ : RegExpMacroAssembler(isolate, zone), range_array_cache_(zone) {}
+ ~NativeRegExpMacroAssembler() override = default;
+
+ // Returns a {Result} sentinel, or the number of successful matches.
+ static int Match(Handle<JSRegExp> regexp, Handle<String> subject,
+ int* offsets_vector, int offsets_vector_length,
+ int previous_index, Isolate* isolate);
+
+ V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset,
+ const byte* input_start,
+ const byte* input_end,
+ int* output, int output_size,
+ Isolate* isolate,
+ JSRegExp regexp);
+
+ bool CanReadUnaligned() const override;
+
+ void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
+ bool check_bounds, int characters,
+ int eats_at_least) override;
+ // Load a number of characters at the given offset from the
+ // current position, into the current-character register.
+ virtual void LoadCurrentCharacterUnchecked(int cp_offset,
+ int character_count) = 0;
+
+ // Called from RegExp if the backtrack stack limit is hit. Tries to expand
+ // the stack. Returns the new stack-pointer if successful, or returns 0 if
+ // unable to grow the stack.
+ // This function must not trigger a garbage collection.
+ //
+ // Called from generated code.
+ static Address GrowStack(Isolate* isolate);
+
+ // Called from generated code.
+ static int CheckStackGuardState(Isolate* isolate, int start_index,
+ RegExp::CallOrigin call_origin,
+ Address* return_address,
+ InstructionStream re_code, Address* subject,
+ const byte** input_start,
+ const byte** input_end);
+
+ static Address word_character_map_address() {
+ return reinterpret_cast<Address>(&word_character_map[0]);
+ }
+
+ protected:
+ // Byte map of one byte characters with a 0xff if the character is a word
+ // character (digit, letter or underscore) and 0x00 otherwise.
+ // Used by generated RegExp code.
+ static const byte word_character_map[256];
+
+ Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges);
+
+ private:
+ // Returns a {Result} sentinel, or the number of successful matches.
+ static int Execute(String input, int start_offset, const byte* input_start,
+ const byte* input_end, int* output, int output_size,
+ Isolate* isolate, JSRegExp regexp);
+
+ ZoneUnorderedMap<uint32_t, Handle<FixedUInt16Array>> range_array_cache_;
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
diff --git a/js/src/irregexp/imported/regexp-nodes.h b/js/src/irregexp/imported/regexp-nodes.h
new file mode 100644
index 0000000000..9407f1c5ec
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-nodes.h
@@ -0,0 +1,775 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_NODES_H_
+#define V8_REGEXP_REGEXP_NODES_H_
+
+#include "irregexp/imported/regexp-macro-assembler.h"
+
+namespace v8 {
+namespace internal {
+
+class AlternativeGenerationList;
+class BoyerMooreLookahead;
+class GreedyLoopState;
+class NodeVisitor;
+class QuickCheckDetails;
+class RegExpCompiler;
+class Trace;
+struct PreloadState;
+class ChoiceNode;
+
+#define FOR_EACH_NODE_TYPE(VISIT) \
+ VISIT(End) \
+ VISIT(Action) \
+ VISIT(Choice) \
+ VISIT(LoopChoice) \
+ VISIT(NegativeLookaroundChoice) \
+ VISIT(BackReference) \
+ VISIT(Assertion) \
+ VISIT(Text)
+
+struct NodeInfo final {
+ NodeInfo()
+ : being_analyzed(false),
+ been_analyzed(false),
+ follows_word_interest(false),
+ follows_newline_interest(false),
+ follows_start_interest(false),
+ at_end(false),
+ visited(false),
+ replacement_calculated(false) {}
+
+ // Returns true if the interests and assumptions of this node
+ // match those of the given one.
+ bool Matches(NodeInfo* that) {
+ return (at_end == that->at_end) &&
+ (follows_word_interest == that->follows_word_interest) &&
+ (follows_newline_interest == that->follows_newline_interest) &&
+ (follows_start_interest == that->follows_start_interest);
+ }
+
+ // Updates the interests of this node given the interests of the
+ // node preceding it.
+ void AddFromPreceding(NodeInfo* that) {
+ at_end |= that->at_end;
+ follows_word_interest |= that->follows_word_interest;
+ follows_newline_interest |= that->follows_newline_interest;
+ follows_start_interest |= that->follows_start_interest;
+ }
+
+ bool HasLookbehind() {
+ return follows_word_interest || follows_newline_interest ||
+ follows_start_interest;
+ }
+
+ // Sets the interests of this node to include the interests of the
+ // following node.
+ void AddFromFollowing(NodeInfo* that) {
+ follows_word_interest |= that->follows_word_interest;
+ follows_newline_interest |= that->follows_newline_interest;
+ follows_start_interest |= that->follows_start_interest;
+ }
+
+ void ResetCompilationState() {
+ being_analyzed = false;
+ been_analyzed = false;
+ }
+
+ bool being_analyzed : 1;
+ bool been_analyzed : 1;
+
+ // These bits are set if this node has to know what the preceding
+ // character was.
+ bool follows_word_interest : 1;
+ bool follows_newline_interest : 1;
+ bool follows_start_interest : 1;
+
+ bool at_end : 1;
+ bool visited : 1;
+ bool replacement_calculated : 1;
+};
+
+struct EatsAtLeastInfo final {
+ EatsAtLeastInfo() : EatsAtLeastInfo(0) {}
+ explicit EatsAtLeastInfo(uint8_t eats)
+ : eats_at_least_from_possibly_start(eats),
+ eats_at_least_from_not_start(eats) {}
+ void SetMin(const EatsAtLeastInfo& other) {
+ if (other.eats_at_least_from_possibly_start <
+ eats_at_least_from_possibly_start) {
+ eats_at_least_from_possibly_start =
+ other.eats_at_least_from_possibly_start;
+ }
+ if (other.eats_at_least_from_not_start < eats_at_least_from_not_start) {
+ eats_at_least_from_not_start = other.eats_at_least_from_not_start;
+ }
+ }
+
+ bool IsZero() const {
+ return eats_at_least_from_possibly_start == 0 &&
+ eats_at_least_from_not_start == 0;
+ }
+
+ // Any successful match starting from the current node will consume at least
+ // this many characters. This does not necessarily mean that there is a
+ // possible match with exactly this many characters, but we generally try to
+ // get this number as high as possible to allow for early exit on failure.
+ uint8_t eats_at_least_from_possibly_start;
+
+ // Like eats_at_least_from_possibly_start, but with the additional assumption
+ // that start-of-string assertions (^) can't match. This value is greater than
+ // or equal to eats_at_least_from_possibly_start.
+ uint8_t eats_at_least_from_not_start;
+};
+
+class RegExpNode : public ZoneObject {
+ public:
+ explicit RegExpNode(Zone* zone)
+ : replacement_(nullptr),
+ on_work_list_(false),
+ trace_count_(0),
+ zone_(zone) {
+ bm_info_[0] = bm_info_[1] = nullptr;
+ }
+ virtual ~RegExpNode();
+ virtual void Accept(NodeVisitor* visitor) = 0;
+ // Generates a goto to this node or actually generates the code at this point.
+ virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
+ // How many characters must this node consume at a minimum in order to
+ // succeed. The not_at_start argument is used to indicate that we know we are
+ // not at the start of the input. In this case anchored branches will always
+ // fail and can be ignored when determining how many characters are consumed
+ // on success. If this node has not been analyzed yet, EatsAtLeast returns 0.
+ int EatsAtLeast(bool not_at_start);
+ // Returns how many characters this node must consume in order to succeed,
+ // given that this is a LoopChoiceNode whose counter register is in a
+ // newly-initialized state at the current position in the generated code. For
+ // example, consider /a{6,8}/. Absent any extra information, the
+ // LoopChoiceNode for the repetition must report that it consumes at least
+ // zero characters, because it may have already looped several times. However,
+ // with a newly-initialized counter, it can report that it consumes at least
+ // six characters.
+ virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry();
+ // Emits some quick code that checks whether the preloaded characters match.
+ // Falls through on certain failure, jumps to the label on possible success.
+ // If the node cannot make a quick check it does nothing and returns false.
+ bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace,
+ Trace* trace, bool preload_has_checked_bounds,
+ Label* on_possible_success,
+ QuickCheckDetails* details_return,
+ bool fall_through_on_failure, ChoiceNode* predecessor);
+ // For a given number of characters this returns a mask and a value. The
+ // next n characters are anded with the mask and compared with the value.
+ // A comparison failure indicates the node cannot match the next n characters.
+ // A comparison success indicates the node may match.
+ virtual void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler,
+ int characters_filled_in,
+ bool not_at_start) = 0;
+ // Fills in quick check details for this node, given that this is a
+ // LoopChoiceNode whose counter register is in a newly-initialized state at
+ // the current position in the generated code. For example, consider /a{6,8}/.
+ // Absent any extra information, the LoopChoiceNode for the repetition cannot
+ // generate any useful quick check because a match might be the (empty)
+ // continuation node. However, with a newly-initialized counter, it can
+ // generate a quick check for several 'a' characters at once.
+ virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
+ RegExpCompiler* compiler,
+ int characters_filled_in,
+ bool not_at_start);
+ static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
+ virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
+ // Only returns the successor for a text node of length 1 that matches any
+ // character and that has no guards on it.
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler) {
+ return nullptr;
+ }
+
+ // Collects information on the possible code units (mod 128) that can match if
+ // we look forward. This is used for a Boyer-Moore-like string searching
+ // implementation. TODO(erikcorry): This should share more code with
+ // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
+ // the number of nodes we are willing to look at in order to create this data.
+ static const int kRecursionBudget = 200;
+ bool KeepRecursing(RegExpCompiler* compiler);
+ virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) {
+ UNREACHABLE();
+ }
+
+ // If we know that the input is one-byte then there are some nodes that can
+ // never match. This method returns a node that can be substituted for
+ // itself, or nullptr if the node can never match.
+ virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) {
+ return this;
+ }
+ // Helper for FilterOneByte.
+ RegExpNode* replacement() {
+ DCHECK(info()->replacement_calculated);
+ return replacement_;
+ }
+ RegExpNode* set_replacement(RegExpNode* replacement) {
+ info()->replacement_calculated = true;
+ replacement_ = replacement;
+ return replacement; // For convenience.
+ }
+
+ // We want to avoid recalculating the lookahead info, so we store it on the
+ // node. Only info that is for this node is stored. We can tell that the
+ // info is for this node when offset == 0, so the information is calculated
+ // relative to this node.
+ void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
+ if (offset == 0) set_bm_info(not_at_start, bm);
+ }
+
+ Label* label() { return &label_; }
+ // If non-generic code is generated for a node (i.e. the node is not at the
+ // start of the trace) then it cannot be reused. This variable sets a limit
+ // on how often we allow that to happen before we insist on starting a new
+ // trace and generating generic code for a node that can be reused by flushing
+ // the deferred actions in the current trace and generating a goto.
+ static const int kMaxCopiesCodeGenerated = 10;
+
+ bool on_work_list() { return on_work_list_; }
+ void set_on_work_list(bool value) { on_work_list_ = value; }
+
+ NodeInfo* info() { return &info_; }
+ const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; }
+ void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) {
+ eats_at_least_ = eats_at_least;
+ }
+
+ // TODO(v8:10441): This is a hacky way to avoid exponential code size growth
+ // for very large choice nodes that can be generated by unicode property
+ // escapes. In order to avoid inlining (i.e. trace recursion), we pretend to
+ // have generated the maximum count of code copies already.
+ // We should instead fix this properly, e.g. by using the code size budget
+ // (flush_budget) or by generating property escape matches as calls to a C
+ // function.
+ void SetDoNotInline() { trace_count_ = kMaxCopiesCodeGenerated; }
+
+ BoyerMooreLookahead* bm_info(bool not_at_start) {
+ return bm_info_[not_at_start ? 1 : 0];
+ }
+
+ Zone* zone() const { return zone_; }
+
+ protected:
+ enum LimitResult { DONE, CONTINUE };
+ RegExpNode* replacement_;
+
+ LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
+
+ void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
+ bm_info_[not_at_start ? 1 : 0] = bm;
+ }
+
+ private:
+ static const int kFirstCharBudget = 10;
+ Label label_;
+ bool on_work_list_;
+ NodeInfo info_;
+
+ // Saved values for EatsAtLeast results, to avoid recomputation. Filled in
+ // during analysis (valid if info_.been_analyzed is true).
+ EatsAtLeastInfo eats_at_least_;
+
+ // This variable keeps track of how many times code has been generated for
+ // this node (in different traces). We don't keep track of where the
+ // generated code is located unless the code is generated at the start of
+ // a trace, in which case it is generic and can be reused by flushing the
+ // deferred operations in the current trace and generating a goto.
+ int trace_count_;
+ BoyerMooreLookahead* bm_info_[2];
+
+ Zone* zone_;
+};
+
+class SeqRegExpNode : public RegExpNode {
+ public:
+ explicit SeqRegExpNode(RegExpNode* on_success)
+ : RegExpNode(on_success->zone()), on_success_(on_success) {}
+ RegExpNode* on_success() { return on_success_; }
+ void set_on_success(RegExpNode* node) { on_success_ = node; }
+ RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override {
+ on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+ if (offset == 0) set_bm_info(not_at_start, bm);
+ }
+
+ protected:
+ RegExpNode* FilterSuccessor(int depth, RegExpFlags flags);
+
+ private:
+ RegExpNode* on_success_;
+};
+
+// A sequence node tagged with an ActionType. The constructor is private;
+// instances are obtained through the static factory functions below (their
+// definitions are not in this header).
+class ActionNode : public SeqRegExpNode {
+ public:
+ enum ActionType {
+ SET_REGISTER_FOR_LOOP,
+ INCREMENT_REGISTER,
+ STORE_POSITION,
+ BEGIN_POSITIVE_SUBMATCH,
+ BEGIN_NEGATIVE_SUBMATCH,
+ POSITIVE_SUBMATCH_SUCCESS,
+ EMPTY_MATCH_CHECK,
+ CLEAR_CAPTURES
+ };
+ // Factory functions, one per kind of action.
+ static ActionNode* SetRegisterForLoop(int reg, int val,
+ RegExpNode* on_success);
+ static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
+ static ActionNode* StorePosition(int reg, bool is_capture,
+ RegExpNode* on_success);
+ static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
+ static ActionNode* BeginPositiveSubmatch(int stack_pointer_reg,
+ int position_reg,
+ RegExpNode* on_success);
+ static ActionNode* BeginNegativeSubmatch(int stack_pointer_reg,
+ int position_reg,
+ RegExpNode* on_success);
+ static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
+ int restore_reg,
+ int clear_capture_count,
+ int clear_capture_from,
+ RegExpNode* on_success);
+ static ActionNode* EmptyMatchCheck(int start_register,
+ int repetition_register,
+ int repetition_limit,
+ RegExpNode* on_success);
+ void Accept(NodeVisitor* visitor) override;
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int filled_in,
+ bool not_at_start) override;
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override;
+ ActionType action_type() { return action_type_; }
+ // TODO(erikcorry): We should allow some action nodes in greedy loops.
+ int GreedyLoopTextLength() override {
+ return kNodeIsTooComplexForGreedyLoops;
+ }
+
+ private:
+ // Per-action payload. Only one member is meaningful for a given node;
+ // NOTE(review): presumably the member matching action_type_ is the one
+ // filled in by the corresponding factory -- confirm in the .cc file.
+ union {
+ struct {
+ int reg;
+ int value;
+ } u_store_register;
+ struct {
+ int reg;
+ } u_increment_register;
+ struct {
+ int reg;
+ bool is_capture;
+ } u_position_register;
+ struct {
+ int stack_pointer_register;
+ int current_position_register;
+ int clear_register_count;
+ int clear_register_from;
+ } u_submatch;
+ struct {
+ int start_register;
+ int repetition_register;
+ int repetition_limit;
+ } u_empty_match_check;
+ struct {
+ int range_from;
+ int range_to;
+ } u_clear_captures;
+ } data_;
+ // Leaves data_ uninitialized; only the action type and successor are set.
+ ActionNode(ActionType action_type, RegExpNode* on_success)
+ : SeqRegExpNode(on_success), action_type_(action_type) {}
+ ActionType action_type_;
+ friend class DotPrinterImpl;
+ // Zone is a friend, presumably so that Zone::New can reach the private
+ // constructor.
+ friend Zone;
+};
+
+// A sequence node holding a list of TextElements (elms_) together with a
+// flag saying whether the text is read backward.
+class TextNode : public SeqRegExpNode {
+ public:
+ // Uses the given element list as-is.
+ TextNode(ZoneList<TextElement>* elms, bool read_backward,
+ RegExpNode* on_success)
+ : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
+ // Allocates a fresh one-element list in this node's zone and stores |that|
+ // in it as a ClassRanges text element.
+ TextNode(RegExpClassRanges* that, bool read_backward, RegExpNode* on_success)
+ : SeqRegExpNode(on_success),
+ elms_(zone()->New<ZoneList<TextElement>>(1, zone())),
+ read_backward_(read_backward) {
+ elms_->Add(TextElement::ClassRanges(that), zone());
+ }
+ // Create TextNode for a single character class for the given ranges.
+ static TextNode* CreateForCharacterRanges(Zone* zone,
+ ZoneList<CharacterRange>* ranges,
+ bool read_backward,
+ RegExpNode* on_success);
+ // Create TextNode for a surrogate pair (i.e. match a sequence of two uc16
+ // code unit ranges).
+ static TextNode* CreateForSurrogatePair(
+ Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
+ bool read_backward, RegExpNode* on_success);
+ static TextNode* CreateForSurrogatePair(Zone* zone,
+ ZoneList<CharacterRange>* lead_ranges,
+ CharacterRange trail,
+ bool read_backward,
+ RegExpNode* on_success);
+ void Accept(NodeVisitor* visitor) override;
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int characters_filled_in,
+ bool not_at_start) override;
+ ZoneList<TextElement>* elements() { return elms_; }
+ bool read_backward() { return read_backward_; }
+ void MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
+ RegExpFlags flags);
+ int GreedyLoopTextLength() override;
+ RegExpNode* GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler) override;
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override;
+ void CalculateOffsets();
+ RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
+ int Length();
+
+ private:
+ // Passes used by TextEmitPass. kFirstRealPass and kLastPass below delimit
+ // the range SIMPLE_CHARACTER_MATCH..CHARACTER_CLASS_MATCH, which leaves
+ // NON_LATIN1_MATCH outside of it.
+ enum TextEmitPassType {
+ NON_LATIN1_MATCH, // Check for characters that can't match.
+ SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
+ NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
+ CASE_CHARACTER_MATCH, // Case-independent single character check.
+ CHARACTER_CLASS_MATCH // Character class.
+ };
+ static bool SkipPass(TextEmitPassType pass, bool ignore_case);
+ static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
+ static const int kLastPass = CHARACTER_CLASS_MATCH;
+ void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
+ bool preloaded, Trace* trace, bool first_element_checked,
+ int* checked_up_to);
+ ZoneList<TextElement>* elms_;
+ bool read_backward_;
+};
+
+// A sequence node tagged with an AssertionType. The constructor is private;
+// the static factories below allocate the node in the successor's zone.
+class AssertionNode : public SeqRegExpNode {
+ public:
+ enum AssertionType {
+ AT_END,
+ AT_START,
+ AT_BOUNDARY,
+ AT_NON_BOUNDARY,
+ AFTER_NEWLINE
+ };
+ // One factory per AssertionType; each allocates via on_success->zone().
+ static AssertionNode* AtEnd(RegExpNode* on_success) {
+ return on_success->zone()->New<AssertionNode>(AT_END, on_success);
+ }
+ static AssertionNode* AtStart(RegExpNode* on_success) {
+ return on_success->zone()->New<AssertionNode>(AT_START, on_success);
+ }
+ static AssertionNode* AtBoundary(RegExpNode* on_success) {
+ return on_success->zone()->New<AssertionNode>(AT_BOUNDARY, on_success);
+ }
+ static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
+ return on_success->zone()->New<AssertionNode>(AT_NON_BOUNDARY, on_success);
+ }
+ static AssertionNode* AfterNewline(RegExpNode* on_success) {
+ return on_success->zone()->New<AssertionNode>(AFTER_NEWLINE, on_success);
+ }
+ void Accept(NodeVisitor* visitor) override;
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int filled_in,
+ bool not_at_start) override;
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override;
+ AssertionType assertion_type() { return assertion_type_; }
+
+ private:
+ // Zone is a friend so that the zone()->New<AssertionNode>(...) calls above
+ // can use the private constructor.
+ friend Zone;
+
+ void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
+ enum IfPrevious { kIsNonWord, kIsWord };
+ void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace,
+ IfPrevious backtrack_if_previous);
+ AssertionNode(AssertionType t, RegExpNode* on_success)
+ : SeqRegExpNode(on_success), assertion_type_(t) {}
+ AssertionType assertion_type_;
+};
+
+// A sequence node that stores a pair of registers (start/end), the regexp
+// flags and a read direction for a back reference.
+class BackReferenceNode : public SeqRegExpNode {
+ public:
+ BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags,
+ bool read_backward, RegExpNode* on_success)
+ : SeqRegExpNode(on_success),
+ start_reg_(start_reg),
+ end_reg_(end_reg),
+ flags_(flags),
+ read_backward_(read_backward) {}
+ void Accept(NodeVisitor* visitor) override;
+ int start_register() { return start_reg_; }
+ int end_register() { return end_reg_; }
+ bool read_backward() { return read_backward_; }
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ // Deliberately a no-op: |details| is left untouched for back references.
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int characters_filled_in,
+ bool not_at_start) override {
+ return;
+ }
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override;
+
+ private:
+ int start_reg_;
+ int end_reg_;
+ RegExpFlags flags_;
+ bool read_backward_;
+};
+
+// A node without a successor, tagged with the Action to perform. Derives
+// directly from RegExpNode, so the zone is passed explicitly.
+class EndNode : public RegExpNode {
+ public:
+ enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
+ EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
+ void Accept(NodeVisitor* visitor) override;
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ // Must never be called on an end node (see UNREACHABLE below).
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int characters_filled_in,
+ bool not_at_start) override {
+ // Returning 0 from EatsAtLeast should ensure we never get here.
+ UNREACHABLE();
+ }
+ // Must never be called on an end node (see UNREACHABLE below).
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override {
+ // Returning 0 from EatsAtLeast should ensure we never get here.
+ UNREACHABLE();
+ }
+
+ private:
+ Action action_;
+};
+
+// An EndNode with action NEGATIVE_SUBMATCH_SUCCESS that additionally stores
+// the stack-pointer register, the position register and a capture range
+// (count and start) for use by its Emit override.
+class NegativeSubmatchSuccess : public EndNode {
+ public:
+ NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg,
+ int clear_capture_count, int clear_capture_start,
+ Zone* zone)
+ : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
+ stack_pointer_register_(stack_pointer_reg),
+ current_position_register_(position_reg),
+ clear_capture_count_(clear_capture_count),
+ clear_capture_start_(clear_capture_start) {}
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+
+ private:
+ int stack_pointer_register_;
+ int current_position_register_;
+ int clear_capture_count_;
+ int clear_capture_start_;
+};
+
+// Zone-allocated triple of (register, relation, value), fixed at
+// construction. NOTE(review): presumably expresses the condition
+// "register <LT|GEQ> value" checked before taking an alternative -- the
+// interpretation lives in ChoiceNode::GenerateGuard, not in this header.
+class Guard : public ZoneObject {
+ public:
+ enum Relation { LT, GEQ };
+ Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {}
+ int reg() { return reg_; }
+ Relation op() { return op_; }
+ int value() { return value_; }
+
+ private:
+ int reg_;
+ Relation op_;
+ int value_;
+};
+
+// Pairs a node with an optional list of Guards. The guard list starts out
+// as nullptr; AddGuard (defined elsewhere) takes the zone to allocate in.
+class GuardedAlternative {
+ public:
+ explicit GuardedAlternative(RegExpNode* node)
+ : node_(node), guards_(nullptr) {}
+ void AddGuard(Guard* guard, Zone* zone);
+ RegExpNode* node() { return node_; }
+ void set_node(RegExpNode* node) { node_ = node; }
+ // May return nullptr when no guard has been added.
+ ZoneList<Guard*>* guards() { return guards_; }
+
+ private:
+ RegExpNode* node_;
+ ZoneList<Guard*>* guards_;
+};
+
+class AlternativeGeneration;
+
+// A node holding a list of GuardedAlternatives. Base class of
+// NegativeLookaroundChoiceNode and LoopChoiceNode.
+class ChoiceNode : public RegExpNode {
+ public:
+ // |expected_size| is the initial capacity passed to the alternatives list.
+ explicit ChoiceNode(int expected_size, Zone* zone)
+ : RegExpNode(zone),
+ alternatives_(
+ zone->New<ZoneList<GuardedAlternative>>(expected_size, zone)),
+ not_at_start_(false),
+ being_calculated_(false) {}
+ void Accept(NodeVisitor* visitor) override;
+ // Appends |node| (copied by value) to the list of alternatives.
+ void AddAlternative(GuardedAlternative node) {
+ alternatives()->Add(node, zone());
+ }
+ ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int characters_filled_in,
+ bool not_at_start) override;
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override;
+
+ bool being_calculated() { return being_calculated_; }
+ bool not_at_start() { return not_at_start_; }
+ // One-way switch: there is no way to clear the flag again.
+ void set_not_at_start() { not_at_start_ = true; }
+ void set_being_calculated(bool b) { being_calculated_ = b; }
+ // Hook for subclasses; the base class allows a quick check for every
+ // alternative.
+ virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
+ return true;
+ }
+ RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
+ // Plain choice nodes report forward reading; LoopChoiceNode overrides this.
+ virtual bool read_backward() { return false; }
+
+ protected:
+ int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
+ ZoneList<GuardedAlternative>* alternatives_;
+
+ private:
+ template <typename...>
+ friend class Analysis;
+
+ void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard,
+ Trace* trace);
+ int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
+ void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace,
+ GuardedAlternative alternative,
+ AlternativeGeneration* alt_gen,
+ int preload_characters,
+ bool next_expects_preload);
+ void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
+ PreloadState* preloads);
+ void AssertGuardsMentionRegisters(Trace* trace);
+ int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
+ Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
+ AlternativeGenerationList* alt_gens,
+ PreloadState* preloads,
+ GreedyLoopState* greedy_loop_state, int text_length);
+ void EmitChoices(RegExpCompiler* compiler,
+ AlternativeGenerationList* alt_gens, int first_choice,
+ Trace* trace, PreloadState* preloads);
+
+ // If true, this node is never checked at the start of the input.
+ // Allows a new trace to start with at_start() set to false.
+ bool not_at_start_;
+ // Flag toggled through set_being_calculated(); NOTE(review): presumably a
+ // recursion guard used during analysis -- confirm against the callers.
+ bool being_calculated_;
+};
+
+// A ChoiceNode with exactly two alternatives, added in a fixed order by the
+// constructor: index kLookaroundIndex (0) is the alternative that must fail,
+// index kContinueIndex (1) is the continuation.
+class NegativeLookaroundChoiceNode : public ChoiceNode {
+ public:
+ explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
+ GuardedAlternative then_do_this,
+ Zone* zone)
+ : ChoiceNode(2, zone) {
+ AddAlternative(this_must_fail);
+ AddAlternative(then_do_this);
+ }
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int characters_filled_in,
+ bool not_at_start) override;
+ // Only the continuation contributes Boyer-Moore info; the lookaround
+ // alternative is not visited. The budget is reduced by one, and |bm| is
+ // recorded on this node when |offset| is 0.
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override {
+ continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm,
+ not_at_start);
+ if (offset == 0) set_bm_info(not_at_start, bm);
+ }
+ static constexpr int kLookaroundIndex = 0;
+ static constexpr int kContinueIndex = 1;
+ RegExpNode* lookaround_node() {
+ return alternatives()->at(kLookaroundIndex).node();
+ }
+ RegExpNode* continue_node() {
+ return alternatives()->at(kContinueIndex).node();
+ }
+ // For a negative lookahead we don't emit the quick check for the
+ // alternative that is expected to fail. This is because quick check code
+ // starts by loading enough characters for the alternative that takes fewest
+ // characters, but on a negative lookahead the negative branch did not take
+ // part in that calculation (EatsAtLeast) so the assumptions don't hold.
+ bool try_to_emit_quick_check_for_alternative(bool is_first) override {
+ return !is_first;
+ }
+ void Accept(NodeVisitor* visitor) override;
+ RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
+};
+
+// A ChoiceNode for loops. It keeps separate pointers to the loop body
+// (loop_node_) and the continuation (continue_node_); both start as nullptr.
+// NOTE(review): presumably they are set by AddLoopAlternative and
+// AddContinueAlternative respectively -- their definitions are elsewhere.
+class LoopChoiceNode : public ChoiceNode {
+ public:
+ LoopChoiceNode(bool body_can_be_zero_length, bool read_backward,
+ int min_loop_iterations, Zone* zone)
+ : ChoiceNode(2, zone),
+ loop_node_(nullptr),
+ continue_node_(nullptr),
+ body_can_be_zero_length_(body_can_be_zero_length),
+ read_backward_(read_backward),
+ traversed_loop_initialization_node_(false),
+ min_loop_iterations_(min_loop_iterations) {}
+ void AddLoopAlternative(GuardedAlternative alt);
+ void AddContinueAlternative(GuardedAlternative alt);
+ void Emit(RegExpCompiler* compiler, Trace* trace) override;
+ void GetQuickCheckDetails(QuickCheckDetails* details,
+ RegExpCompiler* compiler, int characters_filled_in,
+ bool not_at_start) override;
+ void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
+ RegExpCompiler* compiler,
+ int characters_filled_in,
+ bool not_at_start) override;
+ void FillInBMInfo(Isolate* isolate, int offset, int budget,
+ BoyerMooreLookahead* bm, bool not_at_start) override;
+ EatsAtLeastInfo EatsAtLeastFromLoopEntry() override;
+ RegExpNode* loop_node() { return loop_node_; }
+ RegExpNode* continue_node() { return continue_node_; }
+ bool body_can_be_zero_length() { return body_can_be_zero_length_; }
+ int min_loop_iterations() const { return min_loop_iterations_; }
+ // Reports the direction given at construction instead of the base class's
+ // constant false.
+ bool read_backward() override { return read_backward_; }
+ void Accept(NodeVisitor* visitor) override;
+ RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
+
+ private:
+ // AddAlternative is made private for loop nodes because alternatives
+ // should not be added freely, we need to keep track of which node
+ // goes back to the node itself.
+ void AddAlternative(GuardedAlternative node) {
+ ChoiceNode::AddAlternative(node);
+ }
+
+ RegExpNode* loop_node_;
+ RegExpNode* continue_node_;
+ bool body_can_be_zero_length_;
+ bool read_backward_;
+
+ // Temporary marker set only while generating quick check details. Represents
+ // whether GetQuickCheckDetails traversed the initialization node for this
+ // loop's counter. If so, we may be able to generate stricter quick checks
+ // because we know the loop node must match at least min_loop_iterations_
+ // times before the continuation node can match.
+ bool traversed_loop_initialization_node_;
+
+ // The minimum number of times the loop_node_ must match before the
+ // continue_node_ might be considered. This value can be temporarily decreased
+ // while generating quick check details, to represent the remaining iterations
+ // after the completed portion of the quick check details.
+ int min_loop_iterations_;
+
+ // Helper classes (defined elsewhere) that need access to the two
+ // temporary fields above.
+ friend class IterationDecrementer;
+ friend class LoopInitializationMarker;
+};
+
+// Abstract visitor over regexp nodes. FOR_EACH_NODE_TYPE expands
+// DECLARE_VISIT once per node type, producing one pure virtual
+// Visit<Type>(<Type>Node*) method each.
+class NodeVisitor {
+ public:
+ virtual ~NodeVisitor() = default;
+#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0;
+ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
+#undef DECLARE_VISIT
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_NODES_H_
diff --git a/js/src/irregexp/imported/regexp-parser.cc b/js/src/irregexp/imported/regexp-parser.cc
new file mode 100644
index 0000000000..57f4c12fc5
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-parser.cc
@@ -0,0 +1,3131 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-parser.h"
+
+#include "irregexp/imported/regexp-ast.h"
+#include "irregexp/imported/regexp-macro-assembler.h"
+#include "irregexp/imported/regexp.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/usetiter.h"
+#include "unicode/utf16.h" // For U16_NEXT
+#endif // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+namespace {
+
+// Whether we're currently inside the ClassEscape production
+// (tc39.es/ecma262/#prod-annexB-CharacterEscape).
+// Passed to the escape-parsing helpers of the parser as
+// |in_class_escape_state|.
+enum class InClassEscapeState {
+ kInClass,
+ kNotInClass,
+};
+
+// The production used to derive ClassSetOperand.
+// Reported by the ParseClassSetOperand overloads through |type_out|.
+enum class ClassSetOperandType {
+ kClassSetCharacter,
+ kClassStringDisjunction,
+ kNestedClass,
+ kCharacterClassEscape, // \ CharacterClassEscape is a special nested class,
+ // as we can fold it directly into another range.
+ kClassSetRange
+};
+
+// Collects characters, atoms and character classes and groups them into
+// terms. State is kept at three levels:
+//  - characters_: plain code units not yet wrapped in a RegExpAtom,
+//  - text_: text elements not yet merged into a single term,
+//  - *terms_: finished terms; this vector is owned by the caller.
+// In addition, pending_surrogate_ holds a lead surrogate that may still be
+// combined with a following trail surrogate.
+class RegExpTextBuilder {
+ public:
+ using SmallRegExpTreeVector =
+ base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
+
+ // |terms_storage| is stored as a raw pointer and must outlive the builder.
+ RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage,
+ RegExpFlags flags)
+ : zone_(zone),
+ flags_(flags),
+ terms_(terms_storage),
+ text_(ZoneAllocator<RegExpTree*>{zone}) {}
+ void AddCharacter(base::uc16 character);
+ void AddUnicodeCharacter(base::uc32 character);
+ void AddEscapedUnicodeCharacter(base::uc32 character);
+ void AddAtom(RegExpTree* atom);
+ void AddTerm(RegExpTree* term);
+ void AddClassRanges(RegExpClassRanges* cc);
+ void FlushPendingSurrogate();
+ void FlushText();
+ RegExpTree* PopLastAtom();
+ RegExpTree* ToRegExp();
+
+ private:
+ // Sentinel meaning "no lead surrogate is waiting for a trail surrogate".
+ static const base::uc16 kNoPendingSurrogate = 0;
+
+ void AddLeadSurrogate(base::uc16 lead_surrogate);
+ void AddTrailSurrogate(base::uc16 trail_surrogate);
+ void FlushCharacters();
+ bool NeedsDesugaringForUnicode(RegExpClassRanges* cc);
+ bool NeedsDesugaringForIgnoreCase(base::uc32 c);
+ void AddClassRangesForDesugaring(base::uc32 c);
+ bool ignore_case() const { return IsIgnoreCase(flags_); }
+ bool IsUnicodeMode() const {
+ // Either /v or /u enable UnicodeMode
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+ return IsUnicode(flags_) || IsUnicodeSets(flags_);
+ }
+ Zone* zone() const { return zone_; }
+
+ Zone* const zone_;
+ const RegExpFlags flags_;
+ // Lazily allocated on the first plain character; reset to nullptr whenever
+ // the characters are flushed into an atom.
+ ZoneList<base::uc16>* characters_ = nullptr;
+ base::uc16 pending_surrogate_ = kNoPendingSurrogate;
+ // Not owned; points at the vector supplied to the constructor.
+ SmallRegExpTreeVector* terms_;
+ SmallRegExpTreeVector text_;
+};
+
+// Remembers |lead_surrogate| as the pending surrogate. A surrogate that was
+// already pending is flushed first, so it is emitted on its own.
+void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ FlushPendingSurrogate();
+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+ pending_surrogate_ = lead_surrogate;
+}
+
+// Handles a trail surrogate. When a lead surrogate is pending, the two are
+// combined into one code point, which is added either as a desugared class
+// (if case-insensitive desugaring is needed) or as a two-code-unit atom.
+// Without a pending lead, the trail surrogate is emitted on its own.
+void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
+  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+
+  // Lone trail surrogate: route it through the pending-surrogate flush.
+  if (pending_surrogate_ == kNoPendingSurrogate) {
+    pending_surrogate_ = trail_surrogate;
+    FlushPendingSurrogate();
+    return;
+  }
+
+  // Consume the pending lead surrogate.
+  base::uc16 lead = pending_surrogate_;
+  pending_surrogate_ = kNoPendingSurrogate;
+  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead));
+
+  base::uc32 code_point =
+      unibrow::Utf16::CombineSurrogatePair(lead, trail_surrogate);
+  if (NeedsDesugaringForIgnoreCase(code_point)) {
+    AddClassRangesForDesugaring(code_point);
+    return;
+  }
+
+  // Emit the pair as a literal atom of two code units.
+  ZoneList<base::uc16> pair_units(2, zone());
+  pair_units.Add(lead, zone());
+  pair_units.Add(trail_surrogate, zone());
+  RegExpAtom* pair_atom = zone()->New<RegExpAtom>(pair_units.ToConstVector());
+  AddAtom(pair_atom);
+}
+
+// Emits a pending surrogate, if any, as a single-character class term and
+// clears the pending state. Does nothing when no surrogate is pending.
+void RegExpTextBuilder::FlushPendingSurrogate() {
+  if (pending_surrogate_ == kNoPendingSurrogate) return;
+
+  DCHECK(IsUnicodeMode());
+  base::uc32 lone_surrogate = pending_surrogate_;
+  // Reset before adding the term, which itself flushes the builder.
+  pending_surrogate_ = kNoPendingSurrogate;
+  AddClassRangesForDesugaring(lone_surrogate);
+}
+
+// Flushes any pending surrogate, then wraps the accumulated plain
+// characters (if any) into a RegExpAtom appended to text_.
+void RegExpTextBuilder::FlushCharacters() {
+  FlushPendingSurrogate();
+  if (characters_ == nullptr) return;
+
+  RegExpTree* literal = zone()->New<RegExpAtom>(characters_->ToConstVector());
+  characters_ = nullptr;
+  text_.emplace_back(literal);
+}
+
+// Moves the accumulated text elements into the term list. A single element
+// becomes a term directly; several elements are merged into one RegExpText.
+// text_ is empty afterwards.
+void RegExpTextBuilder::FlushText() {
+  FlushCharacters();
+
+  const size_t element_count = text_.size();
+  if (element_count == 0) return;
+
+  if (element_count == 1) {
+    terms_->emplace_back(text_.back());
+  } else {
+    RegExpText* merged = zone()->New<RegExpText>(zone());
+    for (size_t index = 0; index != element_count; ++index) {
+      text_[index]->AppendToText(merged, zone());
+    }
+    terms_->emplace_back(merged);
+  }
+  text_.clear();
+}
+
+// Adds a single code unit. Characters that need case-insensitive desugaring
+// become a class term of their own; all others are appended to the pending
+// character list, which is allocated on first use.
+void RegExpTextBuilder::AddCharacter(base::uc16 c) {
+  FlushPendingSurrogate();
+
+  if (NeedsDesugaringForIgnoreCase(c)) {
+    AddClassRangesForDesugaring(c);
+    return;
+  }
+
+  if (characters_ == nullptr) {
+    characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
+  }
+  characters_->Add(c, zone());
+}
+
+// Adds a code point. Code points above the non-surrogate range are split
+// into a lead/trail surrogate pair. In unicode mode, lone surrogates take
+// the surrogate paths; everything else is added as a plain code unit.
+void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) {
+  const bool needs_surrogate_pair =
+      c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode);
+  if (needs_surrogate_pair) {
+    DCHECK(IsUnicodeMode());
+    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+    return;
+  }
+
+  if (IsUnicodeMode()) {
+    if (unibrow::Utf16::IsLeadSurrogate(c)) {
+      AddLeadSurrogate(c);
+      return;
+    }
+    if (unibrow::Utf16::IsTrailSurrogate(c)) {
+      AddTrailSurrogate(c);
+      return;
+    }
+  }
+
+  AddCharacter(static_cast<base::uc16>(c));
+}
+
+// Adds a code point that came from an escape sequence. The pending surrogate
+// is flushed both before and after the character is added, which isolates
+// the escaped character from its neighbours.
+void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
+ // A lead or trail surrogate parsed via escape sequence will not
+ // pair up with any preceding lead or following trail surrogate.
+ FlushPendingSurrogate();
+ AddUnicodeCharacter(character);
+ FlushPendingSurrogate();
+}
+
+// Adds a character class. Classes that need unicode desugaring must be a
+// standalone term rather than part of a RegExpText; all other classes are
+// added as ordinary text atoms.
+void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) {
+  const bool must_stand_alone = NeedsDesugaringForUnicode(cr);
+  if (!must_stand_alone) {
+    AddAtom(cr);
+    return;
+  }
+  // With /u or /v the class is desugared later, so keep it separate.
+  AddTerm(cr);
+}
+
+// Adds |c| as a standalone term consisting of a character class that
+// contains exactly that one code point.
+void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) {
+  ZoneList<CharacterRange>* singleton_ranges =
+      CharacterRange::List(zone(), CharacterRange::Singleton(c));
+  AddTerm(zone()->New<RegExpClassRanges>(zone(), singleton_ranges));
+}
+
+// Appends |atom| to the current text. Pending characters are flushed first
+// so that they end up in front of the atom.
+void RegExpTextBuilder::AddAtom(RegExpTree* atom) {
+ DCHECK(atom->IsTextElement());
+ FlushCharacters();
+ text_.emplace_back(atom);
+}
+
+// Appends |term| to the term list as a standalone term. All pending text is
+// flushed first, so it becomes a separate term in front of this one.
+void RegExpTextBuilder::AddTerm(RegExpTree* term) {
+ DCHECK(term->IsTextElement());
+ FlushText();
+ terms_->emplace_back(term);
+}
+
+// Returns whether the character class |cc| has to be desugared in unicode
+// mode. Outside unicode mode the answer is always false; with ignore-case
+// it is always true. Otherwise the effective ranges (negated if the class
+// is negated) are checked for non-BMP characters and for any overlap with
+// the surrogate range. Canonicalizes the ranges of |cc| as a side effect.
+bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) {
+  if (!IsUnicodeMode()) return false;
+  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
+  // necessarily mean that we need to desugar. It's probably nicer to have a
+  // separate pass to figure out unicode desugarings.
+  if (ignore_case()) return true;
+
+  ZoneList<CharacterRange>* effective = cc->ranges(zone());
+  CharacterRange::Canonicalize(effective);
+
+  if (cc->is_negated()) {
+    ZoneList<CharacterRange>* complement =
+        zone()->New<ZoneList<CharacterRange>>(effective->length(), zone());
+    CharacterRange::Negate(effective, complement, zone());
+    effective = complement;
+  }
+
+  const int range_count = effective->length();
+  for (int index = 0; index < range_count; ++index) {
+    base::uc32 from = effective->at(index).from();
+    base::uc32 to = effective->at(index).to();
+    // The range reaches into the non-BMP planes.
+    const bool has_non_bmp = to >= kNonBmpStart;
+    // The range overlaps the surrogate block, i.e. contains lone surrogates.
+    const bool has_surrogates =
+        from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart;
+    if (has_non_bmp || has_surrogates) return true;
+  }
+  return false;
+}
+
+// Returns whether |c| must be desugared into a character class because of
+// case-insensitive matching in unicode mode: true iff the case-insensitive
+// closure of {c}, with strings removed, has more than one member.
+bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
+#ifdef V8_INTL_SUPPORT
+  if (!IsUnicodeMode() || !ignore_case()) return false;
+
+  icu::UnicodeSet case_closure(c, c);
+  case_closure.closeOver(USET_CASE_INSENSITIVE);
+  case_closure.removeAllStrings();
+  return case_closure.size() > 1;
+#else
+  // In the case where ICU is not included, we act as if the unicode flag is
+  // not set, and do not desugar.
+  return false;
+#endif  // V8_INTL_SUPPORT
+}
+
+// Removes and returns the most recently added atom, or nullptr if there is
+// none. If plain characters are pending, only the last character is
+// returned (as a new atom); the preceding characters, if any, are pushed to
+// text_ as an atom of their own.
+RegExpTree* RegExpTextBuilder::PopLastAtom() {
+  FlushPendingSurrogate();
+
+  if (characters_ != nullptr) {
+    base::Vector<const base::uc16> pending = characters_->ToConstVector();
+    int pending_length = pending.length();
+    if (pending_length > 1) {
+      // Split off everything but the final character.
+      base::Vector<const base::uc16> head =
+          pending.SubVector(0, pending_length - 1);
+      text_.emplace_back(zone()->New<RegExpAtom>(head));
+      pending = pending.SubVector(pending_length - 1, pending_length);
+    }
+    characters_ = nullptr;
+    RegExpTree* last_char_atom = zone()->New<RegExpAtom>(pending);
+    return last_char_atom;
+  }
+
+  if (text_.size() == 0) return nullptr;
+
+  RegExpTree* last_element = text_.back();
+  text_.pop_back();
+  return last_element;
+}
+
+// Flushes pending text and converts the collected terms into one tree:
+// RegExpEmpty for no terms, the term itself for a single term, otherwise a
+// RegExpAlternative wrapping a zone-allocated copy of the term list.
+RegExpTree* RegExpTextBuilder::ToRegExp() {
+  FlushText();
+
+  switch (terms_->size()) {
+    case 0:
+      return zone()->New<RegExpEmpty>();
+    case 1:
+      return terms_->back();
+    default:
+      return zone()->New<RegExpAlternative>(
+          zone()->New<ZoneList<RegExpTree*>>(
+              base::VectorOf(terms_->begin(), terms_->size()), zone()));
+  }
+}
+
+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
+// Text handling is delegated to an embedded RegExpTextBuilder, which writes
+// finished terms directly into this builder's terms_ vector.
+class RegExpBuilder {
+ public:
+ RegExpBuilder(Zone* zone, RegExpFlags flags)
+ : zone_(zone),
+ flags_(flags),
+ terms_(ZoneAllocator<RegExpTree*>{zone}),
+ alternatives_(ZoneAllocator<RegExpTree*>{zone}),
+ text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {}
+ void AddCharacter(base::uc16 character);
+ void AddUnicodeCharacter(base::uc32 character);
+ void AddEscapedUnicodeCharacter(base::uc32 character);
+ // "Adds" an empty expression. Does nothing except consume a
+ // following quantifier
+ void AddEmpty();
+ void AddClassRanges(RegExpClassRanges* cc);
+ void AddAtom(RegExpTree* tree);
+ void AddTerm(RegExpTree* tree);
+ void AddAssertion(RegExpTree* tree);
+ void NewAlternative(); // '|'
+ bool AddQuantifierToAtom(int min, int max,
+ RegExpQuantifier::QuantifierType type);
+ void FlushText();
+ RegExpTree* ToRegExp();
+ RegExpFlags flags() const { return flags_; }
+
+ // Convenience accessors for individual bits of flags_.
+ bool ignore_case() const { return IsIgnoreCase(flags_); }
+ bool multiline() const { return IsMultiline(flags_); }
+ bool dotall() const { return IsDotAll(flags_); }
+
+ private:
+ void FlushTerms();
+ bool IsUnicodeMode() const {
+ // Either /v or /u enable UnicodeMode
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+ return IsUnicode(flags_) || IsUnicodeSets(flags_);
+ }
+ Zone* zone() const { return zone_; }
+ RegExpTextBuilder& text_builder() { return text_builder_; }
+
+ Zone* const zone_;
+ bool pending_empty_ = false;
+ const RegExpFlags flags_;
+
+ using SmallRegExpTreeVector =
+ base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
+ SmallRegExpTreeVector terms_;
+ SmallRegExpTreeVector alternatives_;
+ // Holds the address of terms_ (see the constructor), so it must stay
+ // declared after terms_.
+ RegExpTextBuilder text_builder_;
+};
+
+// Kind of (sub)expression a RegExpParserState belongs to; stored as its
+// group_type.
+enum SubexpressionType {
+ INITIAL,
+ CAPTURE, // All positive values represent captures.
+ POSITIVE_LOOKAROUND,
+ NEGATIVE_LOOKAROUND,
+ GROUPING
+};
+
+// One entry of the parser's stack of states, implemented as a linked list
+// through previous_state_. Each entry owns the RegExpBuilder for the
+// (sub)expression being parsed plus the fixed attributes of that group.
+class RegExpParserState : public ZoneObject {
+ public:
+ // Push a state on the stack.
+ RegExpParserState(RegExpParserState* previous_state,
+ SubexpressionType group_type,
+ RegExpLookaround::Type lookaround_type,
+ int disjunction_capture_index,
+ const ZoneVector<base::uc16>* capture_name,
+ RegExpFlags flags, Zone* zone)
+ : previous_state_(previous_state),
+ builder_(zone, flags),
+ group_type_(group_type),
+ lookaround_type_(lookaround_type),
+ disjunction_capture_index_(disjunction_capture_index),
+ capture_name_(capture_name) {}
+ // Parser state of containing expression, if any.
+ RegExpParserState* previous_state() const { return previous_state_; }
+ bool IsSubexpression() { return previous_state_ != nullptr; }
+ // RegExpBuilder building this regexp's AST.
+ RegExpBuilder* builder() { return &builder_; }
+ // Type of regexp being parsed (parenthesized group or entire regexp).
+ SubexpressionType group_type() const { return group_type_; }
+ // Lookahead or Lookbehind.
+ RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
+ // Index in captures array of first capture in this sub-expression, if any.
+ // Also the capture index of this sub-expression itself, if group_type
+ // is CAPTURE.
+ int capture_index() const { return disjunction_capture_index_; }
+ // The name of the current sub-expression, if group_type is CAPTURE. Only
+ // used for named captures.
+ const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
+
+ bool IsNamedCapture() const { return capture_name_ != nullptr; }
+
+ // Check whether the parser is inside a capture group with the given index.
+ // Walks from this state outwards and only looks at CAPTURE states.
+ bool IsInsideCaptureGroup(int index) const {
+ for (const RegExpParserState* s = this; s != nullptr;
+ s = s->previous_state()) {
+ if (s->group_type() != CAPTURE) continue;
+ // Return true if we found the matching capture index.
+ if (index == s->capture_index()) return true;
+ // Abort if index is larger than what has been parsed up till this state.
+ if (index > s->capture_index()) return false;
+ }
+ return false;
+ }
+
+ // Check whether the parser is inside a capture group with the given name.
+ // Names are compared by content, not by pointer; states without a name are
+ // skipped.
+ bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
+ DCHECK_NOT_NULL(name);
+ for (const RegExpParserState* s = this; s != nullptr;
+ s = s->previous_state()) {
+ if (s->capture_name() == nullptr) continue;
+ if (*s->capture_name() == *name) return true;
+ }
+ return false;
+ }
+
+ private:
+ // Linked list implementation of stack of states.
+ RegExpParserState* const previous_state_;
+ // Builder for the stored disjunction.
+ RegExpBuilder builder_;
+ // Stored disjunction type (capture, look-ahead or grouping), if any.
+ const SubexpressionType group_type_;
+ // Stored read direction.
+ const RegExpLookaround::Type lookaround_type_;
+ // Stored disjunction's capture index (if any).
+ const int disjunction_capture_index_;
+ // Stored capture name (if any).
+ const ZoneVector<base::uc16>* const capture_name_;
+};
+
+template <class CharT>
+class RegExpParserImpl final {  // Regexp pattern parser templated on the input code-unit type. Every member is private; v8::internal::RegExpParser is declared a friend at the end of the class.
+ private:
+ RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
+ uintptr_t stack_limit, Zone* zone,
+ const DisallowGarbageCollection& no_gc);
+
+ bool Parse(RegExpCompileData* result);
+
+ RegExpTree* ParsePattern();
+ RegExpTree* ParseDisjunction();
+ RegExpTree* ParseGroup();
+
+ // Parses a {...,...} quantifier and stores the range in the given
+ // out parameters.
+ bool ParseIntervalQuantifier(int* min_out, int* max_out);
+
+ // Checks whether the following is a length-digit hexadecimal number,
+ // and sets the value if it is.
+ bool ParseHexEscape(int length, base::uc32* value);
+ bool ParseUnicodeEscape(base::uc32* value);
+ bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
+
+ bool ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2);
+ bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range,
+ CharacterClassStrings* add_to_strings, bool negate,
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2);
+
+ RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges,
+ bool add_unicode_case_equivalents);
+ // Parse inside a class. Either add escaped class to the range, or pass the
+ // parsed single character through |char_out|. The function returns void; presumably |is_class_escape| reports which case applied (confirm against the definition).
+ void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
+ bool add_unicode_case_equivalents, base::uc32* char_out,
+ bool* is_class_escape);
+ // Returns true iff parsing was successful.
+ bool TryParseCharacterClassEscape(base::uc32 next,
+ InClassEscapeState in_class_escape_state,
+ ZoneList<CharacterRange>* ranges,
+ CharacterClassStrings* strings, Zone* zone,
+ bool add_unicode_case_equivalents);
+ RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,
+ CharacterClassStrings* strings);
+ RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
+ ClassSetOperandType* type_out);
+ RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
+ ClassSetOperandType* type_out,
+ ZoneList<CharacterRange>* ranges,
+ CharacterClassStrings* strings);
+ base::uc32 ParseClassSetCharacter();
+ // Parses and returns a single escaped character.
+ base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
+ bool* is_escaped_unicode_character);
+
+ RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,
+ RegExpTree* first_operand,
+ ClassSetOperandType first_operand_type,
+ ZoneList<CharacterRange>* ranges,
+ CharacterClassStrings* strings);
+ RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,
+ bool is_negated, RegExpTree* first_operand,
+ ClassSetOperandType first_operand_type);
+ RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder,
+ bool is_negated, RegExpTree* first_operand,
+ ClassSetOperandType first_operand_type);
+ RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
+
+ base::uc32 ParseOctalLiteral();
+
+ // Tries to parse the input as a back reference. If successful it
+ // stores the result in the output parameter and returns true. If
+ // it fails it will push back the characters read so the same characters
+ // can be reparsed.
+ bool ParseBackReferenceIndex(int* index_out);
+
+ RegExpTree* ReportError(RegExpError error);
+ void Advance();
+ void Advance(int dist);
+ void RewindByOneCodepoint(); // Rewinds to before the previous Advance().
+ void Reset(int pos);
+
+ // Reports whether the pattern might be used as a literal search string.
+ // Only use if the result of the parse is a single atom node.
+ bool simple() const { return simple_; }
+ bool contains_anchor() const { return contains_anchor_; }
+ void set_contains_anchor() { contains_anchor_ = true; }
+ int captures_started() const { return captures_started_; }
+ int position() const { return next_pos_ - 1; }
+ bool failed() const { return failed_; }
+ RegExpFlags flags() const { return top_level_flags_; }
+ bool IsUnicodeMode() const {
+ // Either /v or /u enables UnicodeMode; force_unicode_ (see ForceUnicodeScope) enables it as well.
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern
+ return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;
+ }
+ bool unicode_sets() const { return IsUnicodeSets(flags()); }
+ bool ignore_case() const { return IsIgnoreCase(flags()); }
+
+ static bool IsSyntaxCharacterOrSlash(base::uc32 c);
+ static bool IsClassSetSyntaxCharacter(base::uc32 c);
+ static bool IsClassSetReservedPunctuator(base::uc32 c);
+ bool IsClassSetReservedDoublePunctuator(base::uc32 c);  // Not static: peeks at the following input character via Next().
+
+ static const base::uc32 kEndMarker = (1 << 21);  // Sentinel stored in current_ when no input is left; 1 << 21 lies above the largest code point (0x10FFFF).
+
+ private:
+ // Return the 1-indexed RegExpCapture object, allocate if necessary.
+ RegExpCapture* GetCapture(int index);
+
+ // Creates a new named capture at the specified index. Must be called exactly
+ // once for each named capture. Fails if a capture with the same name is
+ // encountered.
+ bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
+
+ // Parses the name of a capture group (?<name>pattern). The name must adhere
+ // to IdentifierName in the ECMAScript standard.
+ const ZoneVector<base::uc16>* ParseCaptureGroupName();
+
+ bool ParseNamedBackReference(RegExpBuilder* builder,
+ RegExpParserState* state);
+ RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
+
+ // After the initial parsing pass, patch corresponding RegExpCapture objects
+ // into all RegExpBackReferences. This is done after initial parsing in order
+ // to avoid complicating cases in which references come before the capture.
+ void PatchNamedBackReferences();
+
+ ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
+
+ // Returns true iff the pattern contains named captures. May call
+ // ScanForCaptures to look ahead at the remaining pattern.
+ bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
+
+ Zone* zone() const { return zone_; }
+
+ base::uc32 current() const { return current_; }
+ bool has_more() const { return has_more_; }
+ bool has_next() const { return next_pos_ < input_length(); }
+ base::uc32 Next();
+ template <bool update_position>
+ base::uc32 ReadNext();
+ CharT InputAt(int index) const {
+ DCHECK(0 <= index && index < input_length());
+ return input_[index];
+ }
+ int input_length() const { return input_length_; }
+ void ScanForCaptures(InClassEscapeState in_class_escape_state);
+
+ struct RegExpCaptureNameLess {  // Comparator of named_captures_: orders RegExpCapture pointers by the contents of their names.
+ bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
+ DCHECK_NOT_NULL(lhs);
+ DCHECK_NOT_NULL(rhs);
+ return *lhs->name() < *rhs->name();
+ }
+ };
+
+ class ForceUnicodeScope final {  // Sets force_unicode_ on construction and clears it on destruction; the DCHECKs require that scopes are not nested.
+ public:
+ explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
+ : parser_(parser) {
+ DCHECK(!parser_->force_unicode_);
+ parser_->force_unicode_ = true;
+ }
+ ~ForceUnicodeScope() {
+ DCHECK(parser_->force_unicode_);
+ parser_->force_unicode_ = false;
+ }
+
+ private:
+ RegExpParserImpl<CharT>* const parser_;
+ };
+
+ const DisallowGarbageCollection no_gc_;
+ Zone* const zone_;
+ RegExpError error_ = RegExpError::kNone;
+ int error_pos_ = 0;
+ ZoneList<RegExpCapture*>* captures_;
+ ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
+ ZoneList<RegExpBackReference*>* named_back_references_;
+ const CharT* const input_;
+ const int input_length_;
+ base::uc32 current_;
+ const RegExpFlags top_level_flags_;
+ bool force_unicode_ = false; // Force parser to act as if unicode were set.
+ int next_pos_;
+ int captures_started_;
+ int capture_count_; // Only valid after we have scanned for captures.
+ bool has_more_;
+ bool simple_;
+ bool contains_anchor_;
+ bool is_scanned_for_captures_;
+ bool has_named_captures_; // Only valid after we have scanned for captures.
+ bool failed_;
+ const uintptr_t stack_limit_;
+
+ friend class v8::internal::RegExpParser;
+};
+
+template <class CharT>
+RegExpParserImpl<CharT>::RegExpParserImpl(
+ const CharT* input, int input_length, RegExpFlags flags,
+ uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)  // NOTE: |no_gc| is not referenced below; no_gc_ does not appear in the initializer list.
+ : zone_(zone),
+ captures_(nullptr),
+ named_captures_(nullptr),
+ named_back_references_(nullptr),
+ input_(input),
+ input_length_(input_length),
+ current_(kEndMarker),  // Placeholder until the Advance() call in the body reads the first character.
+ top_level_flags_(flags),
+ next_pos_(0),
+ captures_started_(0),
+ capture_count_(0),
+ has_more_(true),
+ simple_(false),
+ contains_anchor_(false),
+ is_scanned_for_captures_(false),
+ has_named_captures_(false),
+ failed_(false),
+ stack_limit_(stack_limit) {
+ Advance();  // Primes current_ with the first character, or kEndMarker if the input is empty; may itself report a stack overflow.
+}
+
+template <>
+template <bool update_position>
+inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
+  // One-byte input: each code unit is returned as-is and is never expected to
+  // be a lead surrogate. The read position only moves when |update_position|
+  // is set.
+  const int read_pos = next_pos_;
+  const base::uc16 unit = InputAt(read_pos);
+  DCHECK(!unibrow::Utf16::IsLeadSurrogate(unit));
+  if (update_position) next_pos_ = read_pos + 1;
+  return unit;
+}
+
+template <>
+template <bool update_position>
+inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
+  const int start = next_pos_;
+  const base::uc16 first_unit = InputAt(start);
+  base::uc32 code_point = first_unit;
+  int units_consumed = 1;
+
+  // In unicode mode, a lead surrogate that is directly followed by a trail
+  // surrogate is combined with it and read as a single code point.
+  const bool may_start_pair = IsUnicodeMode() &&
+                              start + 1 < input_length() &&
+                              unibrow::Utf16::IsLeadSurrogate(first_unit);
+  if (may_start_pair) {
+    const base::uc16 second_unit = InputAt(start + 1);
+    if (unibrow::Utf16::IsTrailSurrogate(second_unit)) {
+      code_point =
+          unibrow::Utf16::CombineSurrogatePair(first_unit, second_unit);
+      units_consumed = 2;
+    }
+  }
+
+  // Peeking callers (update_position == false) leave the position untouched.
+  if (update_position) next_pos_ = start + units_consumed;
+  return code_point;
+}
+
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::Next() {
+  // Peeks at the character after current() without moving the read position;
+  // yields kEndMarker when the input is exhausted.
+  if (!has_next()) return kEndMarker;
+  return ReadNext<false>();
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::Advance() {
+  if (!has_next()) {
+    // Input exhausted. Move next_pos_ one past the end so that position()
+    // points to 1-after-the-last-character; Reset() to that position relies
+    // on this.
+    current_ = kEndMarker;
+    next_pos_ = input_length() + 1;
+    has_more_ = false;
+    return;
+  }
+
+  if (GetCurrentStackPosition() >= stack_limit_) {
+    // Common case: enough stack left, read the next character.
+    current_ = ReadNext<true>();
+    return;
+  }
+
+  // The stack limit has been crossed.
+  if (v8_flags.correctness_fuzzer_suppressions) {
+    FATAL("Aborting on stack overflow");
+  }
+  ReportError(RegExpError::kStackOverflow);
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
+  if (!has_more()) return;
+  // A code point outside the basic multilingual plane was read from two code
+  // units (lead + trail surrogate); anything else occupies a single unit.
+  const bool spans_two_units =
+      current() > unibrow::Utf16::kMaxNonSurrogateCharCode;
+  // Undo the last Advance by stepping back over exactly that code point.
+  Advance(spans_two_units ? -2 : -1);
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::Reset(int pos) {
+  // Reposition the reader so that, after the Advance() below, position()
+  // equals |pos| again (as long as |pos| is inside the input).
+  has_more_ = pos < input_length();
+  next_pos_ = pos;
+  Advance();
+}
+
+template <class CharT>
+void RegExpParserImpl<CharT>::Advance(int dist) {
+  // Skip |dist| - 1 code units, then let the single-step Advance() read the
+  // character at the new position. |dist| may be negative (used for
+  // rewinding).
+  next_pos_ = next_pos_ + (dist - 1);
+  Advance();
+}
+
+// static
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
+  // Returns true iff |c| is one of the characters listed below.
+  constexpr char kMatching[] = {'^', '$', '\\', '.', '*', '+', '?', '(',
+                                ')', '[', ']',  '{', '}', '|', '/'};
+  for (const char candidate : kMatching) {
+    if (c == static_cast<base::uc32>(candidate)) return true;
+  }
+  return false;
+}
+
+// static
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) {
+  // Brackets of all three kinds, plus '/', '-', '\' and '|'.
+  const bool is_bracket = c == '(' || c == ')' || c == '[' || c == ']' ||
+                          c == '{' || c == '}';
+  const bool is_other = c == '/' || c == '-' || c == '\\' || c == '|';
+  return is_bracket || is_other;
+}
+
+// static
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) {
+  // Returns true iff |c| is one of the punctuators listed below.
+  constexpr char kPunctuators[] = {'&', '-', '!', '#', '%', ',', ':',
+                                   ';', '<', '=', '>', '@', '`', '~'};
+  for (const char punctuator : kPunctuators) {
+    if (c == static_cast<base::uc32>(punctuator)) return true;
+  }
+  return false;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) {
+  // True iff |c| is one of the punctuators below AND the next input character
+  // (peeked via Next(), not consumed) is that same punctuator.
+  switch (c) {
+    case '&':
+    case '!':
+    case '#':
+    case '$':
+    case '%':
+    case '*':
+    case '+':
+    case ',':
+    case '.':
+    case ':':
+    case ';':
+    case '<':
+    case '=':
+    case '>':
+    case '?':
+    case '@':
+    case '^':
+    case '`':
+    case '~':
+      return Next() == c;
+    default:
+      return false;
+  }
+}
+
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
+  // Only the first error is recorded; later reports leave it untouched.
+  if (!failed_) {
+    failed_ = true;
+    error_ = error;
+    // Capture the position before it is clobbered below.
+    error_pos_ = position();
+    // Jump to the end of the input so that nothing further is read.
+    current_ = kEndMarker;
+    next_pos_ = input_length();
+    has_more_ = false;
+  }
+  // Always nullptr, so callers can write `return ReportError(...)`.
+  return nullptr;
+}
+
+#define CHECK_FAILED /* used as the LAST argument of a call: this ")" closes that call's argument list */); \
+ if (failed_) return nullptr; \
+ ((void)0 /* deliberately left open: the call site's own ")" and ";" terminate this no-op */
+
+// Pattern ::
+//   Disjunction
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
+  RegExpTree* tree = ParseDisjunction();
+  if (failed_) return nullptr;
+
+  // Named back references can only be resolved once the whole pattern has
+  // been seen.
+  PatchNamedBackReferences();
+  if (failed_) return nullptr;
+
+  DCHECK(!has_more());
+  // A literal string atom whose length equals the input length is identical
+  // to the input itself; flag such patterns as simple.
+  const bool atom_covers_input =
+      tree->IsAtom() && tree->AsAtom()->length() == input_length();
+  if (atom_covers_input) simple_ = true;
+  return tree;
+}
+
+// Disjunction ::
+// Alternative
+// Alternative | Disjunction
+// Alternative ::
+// [empty]
+// Term Alternative
+// Term ::
+// Assertion
+// Atom
+// Atom Quantifier
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {  // Iterative: nested groups are tracked on a linked stack of RegExpParserState objects instead of by recursion. Returns nullptr on error.
+ // Used to store current state while parsing subexpressions.
+ RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
+ 0, nullptr, flags(), zone());
+ RegExpParserState* state = &initial_state;
+ // Cache the builder in a local variable for quick access.
+ RegExpBuilder* builder = initial_state.builder();
+ while (true) {  // Per iteration: `break` out of the first switch continues to the quantifier switch below; `continue` skips quantifier parsing.
+ switch (current()) {
+ case kEndMarker:
+ if (failed()) return nullptr; // E.g. the initial Advance failed.
+ if (state->IsSubexpression()) {
+ // Inside a parenthesized group when hitting end of input.
+ return ReportError(RegExpError::kUnterminatedGroup);
+ }
+ DCHECK_EQ(INITIAL, state->group_type());
+ // Parsing completed successfully.
+ return builder->ToRegExp();
+ case ')': {
+ if (!state->IsSubexpression()) {
+ return ReportError(RegExpError::kUnmatchedParen);
+ }
+ DCHECK_NE(INITIAL, state->group_type());
+
+ Advance();
+ // End disjunction parsing and convert builder content to new single
+ // regexp atom.
+ RegExpTree* body = builder->ToRegExp();
+
+ int end_capture_index = captures_started();
+
+ int capture_index = state->capture_index();
+ SubexpressionType group_type = state->group_type();
+
+ // Build result of subexpression.
+ if (group_type == CAPTURE) {
+ if (state->IsNamedCapture()) {
+ CreateNamedCaptureAtIndex(state->capture_name(),
+ capture_index CHECK_FAILED);
+ }
+ RegExpCapture* capture = GetCapture(capture_index);
+ capture->set_body(body);
+ body = capture;
+ } else if (group_type == GROUPING) {
+ body = zone()->template New<RegExpGroup>(body);
+ } else {
+ DCHECK(group_type == POSITIVE_LOOKAROUND ||
+ group_type == NEGATIVE_LOOKAROUND);
+ bool is_positive = (group_type == POSITIVE_LOOKAROUND);
+ body = zone()->template New<RegExpLookaround>(
+ body, is_positive, end_capture_index - capture_index,
+ capture_index, state->lookaround_type());
+ }
+
+ // Restore previous state.
+ state = state->previous_state();
+ builder = state->builder();
+
+ builder->AddAtom(body);
+ // For compatibility with JSC and ES3, we allow quantifiers after
+ // lookaheads, and break in all cases.
+ break;
+ }
+ case '|': {
+ Advance();
+ builder->NewAlternative();
+ continue;
+ }
+ case '*':
+ case '+':
+ case '?':
+ return ReportError(RegExpError::kNothingToRepeat);  // A quantifier character at the start of a term has nothing before it to apply to.
+ case '^': {
+ Advance();
+ builder->AddAssertion(zone()->template New<RegExpAssertion>(
+ builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
+ : RegExpAssertion::Type::START_OF_INPUT));
+ set_contains_anchor();
+ continue;
+ }
+ case '$': {
+ Advance();
+ RegExpAssertion::Type assertion_type =
+ builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
+ : RegExpAssertion::Type::END_OF_INPUT;
+ builder->AddAssertion(
+ zone()->template New<RegExpAssertion>(assertion_type));
+ continue;
+ }
+ case '.': {
+ Advance();
+ ZoneList<CharacterRange>* ranges =
+ zone()->template New<ZoneList<CharacterRange>>(2, zone());
+
+ if (builder->dotall()) {
+ // Everything.
+ CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
+ ranges, false, zone());
+ } else {
+ // Everything except \x0A, \x0D, \u2028 and \u2029.
+ CharacterRange::AddClassEscape(
+ StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
+ }
+
+ RegExpClassRanges* cc =
+ zone()->template New<RegExpClassRanges>(zone(), ranges);
+ builder->AddClassRanges(cc);
+ break;
+ }
+ case '(': {
+ state = ParseOpenParenthesis(state CHECK_FAILED);  // Pushes a new state for the group; CHECK_FAILED returns nullptr from this function on error.
+ builder = state->builder();
+ continue;
+ }
+ case '[': {
+ RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
+ if (cc->IsClassRanges()) {
+ builder->AddClassRanges(cc->AsClassRanges());
+ } else {
+ DCHECK(cc->IsClassSetExpression());
+ builder->AddTerm(cc);
+ }
+ break;
+ }
+ // Atom ::
+ // \ AtomEscape
+ case '\\':
+ switch (Next()) {
+ case kEndMarker:
+ return ReportError(RegExpError::kEscapeAtEndOfPattern);
+ // AtomEscape ::
+ // [+UnicodeMode] DecimalEscape
+ // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
+ // of DecimalEscape is ≤ NcapturingParens
+ // CharacterEscape (some cases of this mixed in too)
+ //
+ // TODO(jgruber): It may make sense to disentangle all the different
+ // cases and make the structure mirror the spec, e.g. for AtomEscape:
+ //
+ // if (TryParseDecimalEscape(...)) return;
+ // if (TryParseCharacterClassEscape(...)) return;
+ // if (TryParseCharacterEscape(...)) return;
+ // if (TryParseGroupName(...)) return;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ int index = 0;
+ const bool is_backref =
+ ParseBackReferenceIndex(&index CHECK_FAILED);
+ if (is_backref) {
+ if (state->IsInsideCaptureGroup(index)) {
+ // The back reference is inside the capture group it refers to.
+ // Nothing can possibly have been captured yet, so we use empty
+ // instead. This ensures that, when checking a back reference,
+ // the capture registers of the referenced capture are either
+ // both set or both cleared.
+ builder->AddEmpty();
+ } else {
+ RegExpCapture* capture = GetCapture(index);
+ RegExpTree* atom = zone()->template New<RegExpBackReference>(
+ capture, builder->flags());
+ builder->AddAtom(atom);
+ }
+ break;
+ }
+ // With /u and /v, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (IsUnicodeMode()) {
+ return ReportError(RegExpError::kInvalidEscape);
+ }
+ base::uc32 first_digit = Next();
+ if (first_digit == '8' || first_digit == '9') {
+ builder->AddCharacter(first_digit);
+ Advance(2);
+ break;
+ }
+ V8_FALLTHROUGH;  // Digits 1-7 that are not a back reference share the '0' handling below (octal literal).
+ }
+ case '0': {
+ Advance();
+ if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') {
+ // Decimal escape with leading 0 are not parsed as octal.
+ return ReportError(RegExpError::kInvalidDecimalEscape);
+ }
+ base::uc32 octal = ParseOctalLiteral();
+ builder->AddCharacter(octal);
+ break;
+ }
+ case 'b':
+ Advance(2);
+ builder->AddAssertion(zone()->template New<RegExpAssertion>(
+ RegExpAssertion::Type::BOUNDARY));
+ continue;
+ case 'B':
+ Advance(2);
+ builder->AddAssertion(zone()->template New<RegExpAssertion>(
+ RegExpAssertion::Type::NON_BOUNDARY));
+ continue;
+ // AtomEscape ::
+ // CharacterClassEscape
+ case 'd':
+ case 'D':
+ case 's':
+ case 'S':
+ case 'w':
+ case 'W': {
+ base::uc32 next = Next();
+ ZoneList<CharacterRange>* ranges =
+ zone()->template New<ZoneList<CharacterRange>>(2, zone());
+ bool add_unicode_case_equivalents =
+ IsUnicodeMode() && ignore_case();
+ bool parsed_character_class_escape = TryParseCharacterClassEscape(
+ next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(),
+ add_unicode_case_equivalents CHECK_FAILED);
+
+ if (parsed_character_class_escape) {
+ RegExpClassRanges* cc =
+ zone()->template New<RegExpClassRanges>(zone(), ranges);
+ builder->AddClassRanges(cc);
+ } else {
+ CHECK(!IsUnicodeMode());
+ Advance(2);
+ builder->AddCharacter(next); // IdentityEscape.
+ }
+ break;
+ }
+ case 'p':
+ case 'P': {
+ base::uc32 next = Next();
+ ZoneList<CharacterRange>* ranges =
+ zone()->template New<ZoneList<CharacterRange>>(2, zone());
+ CharacterClassStrings* strings = nullptr;
+ if (unicode_sets()) {
+ strings = zone()->template New<CharacterClassStrings>(zone());
+ }
+ bool add_unicode_case_equivalents = ignore_case();
+ bool parsed_character_class_escape = TryParseCharacterClassEscape(
+ next, InClassEscapeState::kNotInClass, ranges, strings, zone(),
+ add_unicode_case_equivalents CHECK_FAILED);
+
+ if (parsed_character_class_escape) {
+ if (unicode_sets()) {
+ RegExpClassSetOperand* op =
+ zone()->template New<RegExpClassSetOperand>(ranges,
+ strings);
+ builder->AddTerm(op);
+ } else {
+ RegExpClassRanges* cc =
+ zone()->template New<RegExpClassRanges>(zone(), ranges);
+ builder->AddClassRanges(cc);
+ }
+ } else {
+ CHECK(!IsUnicodeMode());
+ Advance(2);
+ builder->AddCharacter(next); // IdentityEscape.
+ }
+ break;
+ }
+ // AtomEscape ::
+ // k GroupName
+ case 'k': {
+ // Either an identity escape or a named back-reference. The two
+ // interpretations are mutually exclusive: '\k' is interpreted as
+ // an identity escape for non-Unicode patterns without named
+ // capture groups, and as the beginning of a named back-reference
+ // in all other cases.
+ const bool has_named_captures =
+ HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
+ if (IsUnicodeMode() || has_named_captures) {
+ Advance(2);
+ ParseNamedBackReference(builder, state CHECK_FAILED);
+ break;
+ }
+ }
+ V8_FALLTHROUGH;  // '\k' as an identity escape is handled by the generic CharacterEscape case.
+ // AtomEscape ::
+ // CharacterEscape
+ default: {
+ bool is_escaped_unicode_character = false;
+ base::uc32 c = ParseCharacterEscape(
+ InClassEscapeState::kNotInClass,
+ &is_escaped_unicode_character CHECK_FAILED);
+ if (is_escaped_unicode_character) {
+ builder->AddEscapedUnicodeCharacter(c);
+ } else {
+ builder->AddCharacter(c);
+ }
+ break;
+ }
+ }
+ break;
+ case '{': {
+ int dummy;  // Both out-parameters alias this variable; only the success flag is used here.
+ bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
+ if (parsed) return ReportError(RegExpError::kNothingToRepeat);
+ V8_FALLTHROUGH;
+ }
+ case '}':
+ case ']':
+ if (IsUnicodeMode()) {
+ return ReportError(RegExpError::kLoneQuantifierBrackets);
+ }
+ V8_FALLTHROUGH;  // Outside unicode mode these brackets are added as ordinary characters.
+ default:
+ builder->AddUnicodeCharacter(current());
+ Advance();
+ break;
+ } // end switch(current())
+
+ int min;
+ int max;
+ switch (current()) {
+ // QuantifierPrefix ::
+ // *
+ // +
+ // ?
+ // {
+ case '*':
+ min = 0;
+ max = RegExpTree::kInfinity;
+ Advance();
+ break;
+ case '+':
+ min = 1;
+ max = RegExpTree::kInfinity;
+ Advance();
+ break;
+ case '?':
+ min = 0;
+ max = 1;
+ Advance();
+ break;
+ case '{':
+ if (ParseIntervalQuantifier(&min, &max)) {
+ if (max < min) {
+ return ReportError(RegExpError::kRangeOutOfOrder);
+ }
+ break;
+ } else if (IsUnicodeMode()) {
+ // Incomplete quantifiers are not allowed.
+ return ReportError(RegExpError::kIncompleteQuantifier);
+ }
+ continue;
+ default:
+ continue;  // No quantifier follows the term just parsed.
+ }
+ RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
+ if (current() == '?') {
+ quantifier_type = RegExpQuantifier::NON_GREEDY;
+ Advance();
+ } else if (v8_flags.regexp_possessive_quantifier && current() == '+') {
+ // v8_flags.regexp_possessive_quantifier is a debug-only flag.
+ quantifier_type = RegExpQuantifier::POSSESSIVE;
+ Advance();
+ }
+ if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+ return ReportError(RegExpError::kInvalidQuantifier);
+ }
+ }
+}
+
+template <class CharT>
+RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
+    RegExpParserState* state) {
+  // Defaults describe a plain capturing group "(...)"; a "(?" prefix below may
+  // override them. The read direction is inherited from the enclosing state.
+  SubexpressionType subexpr_type = CAPTURE;
+  RegExpLookaround::Type lookaround_type = state->lookaround_type();
+  bool is_named_capture = false;
+
+  Advance();  // Step past the '('.
+  if (current() == '?') {
+    const base::uc32 selector = Next();
+    if (selector == ':') {
+      // (?:  non-capturing group.
+      Advance(2);
+      subexpr_type = GROUPING;
+    } else if (selector == '=' || selector == '!') {
+      // (?=  /  (?!  lookahead.
+      Advance(2);
+      lookaround_type = RegExpLookaround::LOOKAHEAD;
+      subexpr_type =
+          selector == '=' ? POSITIVE_LOOKAROUND : NEGATIVE_LOOKAROUND;
+    } else if (selector == '<') {
+      Advance();
+      const base::uc32 after_angle = Next();
+      if (after_angle == '=' || after_angle == '!') {
+        // (?<=  /  (?<!  lookbehind.
+        Advance(2);
+        lookaround_type = RegExpLookaround::LOOKBEHIND;
+        subexpr_type =
+            after_angle == '=' ? POSITIVE_LOOKAROUND : NEGATIVE_LOOKAROUND;
+      } else {
+        // (?<name>  named capture; the name itself is parsed further down.
+        is_named_capture = true;
+        has_named_captures_ = true;
+        Advance();
+      }
+    } else {
+      ReportError(RegExpError::kInvalidGroup);
+      return nullptr;
+    }
+  }
+
+  const ZoneVector<base::uc16>* capture_name = nullptr;
+  if (subexpr_type == CAPTURE) {
+    if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) {
+      ReportError(RegExpError::kTooManyCaptures);
+      return nullptr;
+    }
+    captures_started_++;
+
+    if (is_named_capture) {
+      capture_name = ParseCaptureGroupName();
+      if (failed_) return nullptr;
+    }
+  }
+
+  // Push a new state on top of |state| and begin a new disjunction.
+  return zone()->template New<RegExpParserState>(
+      state, subexpr_type, lookaround_type, captures_started_, capture_name,
+      state->builder()->flags(), zone());
+}
+
+// In order to know whether an escape is a backreference or not we have to scan
+// the entire regexp and find the number of capturing parentheses. However we
+// don't want to scan the regexp twice unless it is necessary. This mini-parser
+// is called when needed. It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and backslash-escaped
+// characters.
+//
+// Important: The scanner has to be in a consistent state when calling
+// ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while
+// parsing a nested class.
+template <class CharT>
+void RegExpParserImpl<CharT>::ScanForCaptures(
+ InClassEscapeState in_class_escape_state) {
+ DCHECK(!is_scanned_for_captures_);
+ const int saved_position = position();  // Restored by the Reset() at the end, so the scan consumes no input.
+ // Start with captures started previous to current position
+ int capture_count = captures_started();
+ // When we start inside a character class, skip everything inside the class.
+ if (in_class_escape_state == InClassEscapeState::kInClass) {
+ // \k is always invalid within a class in unicode mode, thus we should never
+ // call ScanForCaptures within a class.
+ DCHECK(!IsUnicodeMode());
+ int c;
+ while ((c = current()) != kEndMarker) {
+ Advance();
+ if (c == '\\') {
+ Advance();  // Skip the escaped character as well.
+ } else {
+ if (c == ']') break;
+ }
+ }
+ }
+ // Add count of captures after this position.
+ int n;
+ while ((n = current()) != kEndMarker) {
+ Advance();
+ switch (n) {
+ case '\\':
+ Advance();  // Skip the escaped character so that e.g. '\(' is not counted.
+ break;
+ case '[': {
+ int class_nest_level = 0;
+ int c;
+ while ((c = current()) != kEndMarker) {
+ Advance();
+ if (c == '\\') {
+ Advance();
+ } else if (c == '[') {
+ // With /v, '[' inside a class is treated as a nested class.
+ // Without /v, '[' is a normal character.
+ if (unicode_sets()) class_nest_level++;
+ } else if (c == ']') {
+ if (class_nest_level == 0) break;
+ class_nest_level--;
+ }
+ }
+ break;
+ }
+ case '(':
+ if (current() == '?') {
+ // At this point we could be in
+ // * a non-capturing group '(?:',
+ // * a lookbehind assertion '(?<=' '(?<!'
+ // * or a named capture '(?<'.
+ //
+ // Of these, only named captures are capturing groups.
+
+ Advance();
+ if (current() != '<') break;
+
+ Advance();
+ if (current() == '=' || current() == '!') break;
+
+ // Found a possible named capture. It could turn out to be a syntax
+ // error (e.g. an unterminated or invalid name), but that distinction
+ // does not matter for our purposes.
+ has_named_captures_ = true;
+ }
+ capture_count++;  // Reached for a plain '(' and for a named capture '(?<name'.
+ break;
+ }
+ }
+ capture_count_ = capture_count;
+ is_scanned_for_captures_ = true;
+ Reset(saved_position);
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
+  DCHECK_EQ('\\', current());
+  DCHECK('1' <= Next() && Next() <= '9');
+  // Remember where the escape begins so that every failure path can rewind
+  // and let the caller reparse the same characters.
+  const int escape_start = position();
+
+  // Accumulate a decimal literal, starting with the digit after the '\'.
+  int parsed_index = Next() - '0';
+  Advance(2);
+  for (base::uc32 digit = current(); IsDecimalDigit(digit);
+       digit = current()) {
+    parsed_index = 10 * parsed_index + (digit - '0');
+    if (parsed_index > RegExpMacroAssembler::kMaxCaptures) {
+      Reset(escape_start);
+      return false;
+    }
+    Advance();
+  }
+
+  // The literal is only a back reference if it does not exceed the total
+  // number of capturing groups. Groups opened so far are known; the rest of
+  // the pattern is scanned lazily, and at most once.
+  if (parsed_index > captures_started()) {
+    if (!is_scanned_for_captures_) {
+      ScanForCaptures(InClassEscapeState::kNotInClass);
+    }
+    if (parsed_index > capture_count_) {
+      Reset(escape_start);
+      return false;
+    }
+  }
+
+  *index_out = parsed_index;
+  return true;
+}
+
+namespace {
+
+// Appends |code_unit| to |v| as UTF-16: values above the largest non-surrogate
+// char code are split into a lead/trail surrogate pair, everything else is
+// appended as a single unit.
+void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
+  if (code_unit > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
+    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
+    return;
+  }
+  v->push_back(code_unit);
+}
+
+}  // namespace
+
+template <class CharT>
+const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {  // Returns the zone-allocated name as UTF-16 code units, or nullptr after reporting an error.
+ // Due to special Advance requirements (see the next comment), rewind by one
+ // such that names starting with a surrogate pair are parsed correctly for
+ // patterns where the unicode flag is unset.
+ //
+ // Note that we use this odd pattern of rewinding the last advance in order
+ // to adhere to the common parser behavior of expecting `current` to point at
+ // the first candidate character for a function (e.g. when entering ParseFoo,
+ // `current` should point at the first character of Foo).
+ RewindByOneCodepoint();
+
+ ZoneVector<base::uc16>* name =
+ zone()->template New<ZoneVector<base::uc16>>(zone());
+
+ {
+ // Advance behavior inside this function is tricky since
+ // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
+ // and thus allows surrogate pairs and \u{}-style escapes even in
+ // non-unicode patterns. Therefore Advance within the capture group name
+ // has to force-enable unicode, and outside the name revert to default
+ // behavior.
+ ForceUnicodeScope force_unicode(this);
+
+ bool at_start = true;
+ while (true) {
+ Advance();
+ base::uc32 c = current();
+
+ // Convert unicode escapes.
+ if (c == '\\' && Next() == 'u') {
+ Advance(2);
+ if (!ParseUnicodeEscape(&c)) {
+ ReportError(RegExpError::kInvalidUnicodeEscape);
+ return nullptr;
+ }
+ RewindByOneCodepoint();  // Step back so that the Advance() at the top of the loop lands on the character after the escape.
+ }
+
+ // The backslash char is misclassified as both ID_Start and ID_Continue.
+ if (c == '\\') {
+ ReportError(RegExpError::kInvalidCaptureGroupName);
+ return nullptr;
+ }
+
+ if (at_start) {
+ if (!IsIdentifierStart(c)) {  // Also rejects an empty name: '>' is not accepted as the first character.
+ ReportError(RegExpError::kInvalidCaptureGroupName);
+ return nullptr;
+ }
+ push_code_unit(name, c);
+ at_start = false;
+ } else {
+ if (c == '>') {  // Closing delimiter: the name is complete.
+ break;
+ } else if (IsIdentifierPart(c)) {
+ push_code_unit(name, c);
+ } else {
+ ReportError(RegExpError::kInvalidCaptureGroupName);
+ return nullptr;
+ }
+ }
+ }
+ }
+
+ // This final advance goes back into the state of pointing at the next
+ // relevant char, which the rest of the parser expects. See also the previous
+ // comments in this function.
+ Advance();
+ return name;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
+    const ZoneVector<base::uc16>* name, int index) {
+  DCHECK(0 < index && index <= captures_started_);
+  DCHECK_NOT_NULL(name);
+
+  // Attach the name to the capture first: the set below compares captures by
+  // name, so the duplicate lookup needs it in place.
+  RegExpCapture* const named_capture = GetCapture(index);
+  DCHECK_NULL(named_capture->name());
+  named_capture->set_name(name);
+
+  if (named_captures_ != nullptr) {
+    // Bail out if another capture already carries this name.
+    const auto& existing = named_captures_->find(named_capture);
+    if (existing != named_captures_->end()) {
+      ReportError(RegExpError::kDuplicateCaptureGroupName);
+      return false;
+    }
+  } else {
+    // First named capture of the pattern: allocate the set lazily.
+    named_captures_ =
+        zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
+            zone());
+  }
+
+  named_captures_->emplace(named_capture);
+  return true;
+}
+
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseNamedBackReference(
+    RegExpBuilder* builder, RegExpParserState* state) {
+  // On entry the parser must be positioned on the '<' of \k<name>.
+  if (current() != '<') {
+    ReportError(RegExpError::kInvalidNamedReference);
+    return false;
+  }
+  Advance();
+
+  const ZoneVector<base::uc16>* const group_name = ParseCaptureGroupName();
+  if (group_name == nullptr) return false;
+
+  // A reference located inside the very group it names is replaced by an
+  // empty node.
+  if (state->IsInsideCaptureGroup(group_name)) {
+    builder->AddEmpty();
+    return true;
+  }
+
+  // Otherwise emit a back reference that only knows its name for now, and
+  // record it so that PatchNamedBackReferences() can attach the capture once
+  // the whole pattern has been parsed.
+  RegExpBackReference* const backref =
+      zone()->template New<RegExpBackReference>(builder->flags());
+  backref->set_name(group_name);
+  builder->AddAtom(backref);
+
+  if (named_back_references_ == nullptr) {
+    named_back_references_ =
+        zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
+  }
+  named_back_references_->Add(backref, zone());
+  return true;
+}
+
+// Resolves every recorded named back reference to the capture carrying the
+// same name. Reports kInvalidNamedCaptureReference and stops at the first
+// reference whose name has no matching entry in named_captures_ (or when
+// there are no named captures at all).
+template <class CharT>
+void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
+  if (named_back_references_ == nullptr) return;
+
+  if (named_captures_ == nullptr) {
+    ReportError(RegExpError::kInvalidNamedCaptureReference);
+    return;
+  }
+
+  // The probe capture below is only used to search named_captures_ by name;
+  // its index is never used.
+  constexpr int kInvalidIndex = 0;
+
+  const int reference_count = named_back_references_->length();
+  for (int i = 0; i < reference_count; ++i) {
+    RegExpBackReference* const reference = named_back_references_->at(i);
+
+    RegExpCapture* const probe =
+        zone()->template New<RegExpCapture>(kInvalidIndex);
+    DCHECK_NULL(probe->name());
+    probe->set_name(reference->name());
+
+    const auto match = named_captures_->find(probe);
+    if (match == named_captures_->end()) {
+      ReportError(RegExpError::kInvalidNamedCaptureReference);
+      return;
+    }
+
+    const int capture_index = (*match)->index();
+    reference->set_capture(GetCapture(capture_index));
+  }
+}
+
+// Returns the capture group with one-based index |index|. captures_ is
+// created on first use and topped up so that it holds one RegExpCapture per
+// capture known so far: capture_count_ once the pattern has been scanned for
+// captures, captures_started_ otherwise.
+template <class CharT>
+RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
+  int known_captures;
+  if (is_scanned_for_captures_) {
+    known_captures = capture_count_;
+  } else {
+    known_captures = captures_started_;
+  }
+  DCHECK(index <= known_captures);
+
+  if (captures_ == nullptr) {
+    captures_ =
+        zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
+  }
+
+  // Capture indices are one-based while list slots are zero-based, so the
+  // capture stored in slot i carries index i + 1.
+  for (int next_index = captures_->length() + 1; next_index <= known_captures;
+       ++next_index) {
+    captures_->Add(zone()->template New<RegExpCapture>(next_index), zone());
+  }
+  return captures_->at(index - 1);
+}
+
+// Returns a zone-allocated vector holding the contents of named_captures_ in
+// the set's iteration order, or nullptr when there are no named captures.
+template <class CharT>
+ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
+  const bool has_entries =
+      named_captures_ != nullptr && !named_captures_->empty();
+  if (!has_entries) return nullptr;
+
+  return zone()->template New<ZoneVector<RegExpCapture*>>(
+      named_captures_->begin(), named_captures_->end(), zone());
+}
+
+// Tells whether the pattern contains named captures. If that is not known
+// yet (no named capture seen so far and no capture scan done), the pattern is
+// scanned for captures first.
+template <class CharT>
+bool RegExpParserImpl<CharT>::HasNamedCaptures(
+    InClassEscapeState in_class_escape_state) {
+  const bool answer_is_known = has_named_captures_ || is_scanned_for_captures_;
+  if (!answer_is_known) {
+    ScanForCaptures(in_class_escape_state);
+    DCHECK(is_scanned_for_captures_);
+  }
+  return has_named_captures_;
+}
+
+// QuantifierPrefix ::
+//   { DecimalDigits }
+//   { DecimalDigits , }
+//   { DecimalDigits , DecimalDigits }
+//
+// On success stores the bounds in |*min_out| / |*max_out|, leaves the parser
+// just past the closing '}' and returns true. A bound that would overflow
+// RegExpTree::kInfinity is clamped to RegExpTree::kInfinity, and a missing
+// upper bound ("{n,}") also yields RegExpTree::kInfinity. If the text is not
+// a well-formed quantifier, the position is reset to the '{' and false is
+// returned without writing the out-parameters.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
+                                                      int* max_out) {
+  DCHECK_EQ(current(), '{');
+  const int start = position();
+  Advance();
+
+  // Consumes a (possibly empty) run of decimal digits and returns its value,
+  // saturating at RegExpTree::kInfinity. On overflow the remaining digits are
+  // still consumed.
+  const auto parse_saturating_decimal = [this]() -> int {
+    int value = 0;
+    while (IsDecimalDigit(current())) {
+      const int digit = current() - '0';
+      if (value > (RegExpTree::kInfinity - digit) / 10) {
+        do {
+          Advance();
+        } while (IsDecimalDigit(current()));
+        return RegExpTree::kInfinity;
+      }
+      value = 10 * value + digit;
+      Advance();
+    }
+    return value;
+  };
+
+  // The lower bound is mandatory.
+  if (!IsDecimalDigit(current())) {
+    Reset(start);
+    return false;
+  }
+  const int min = parse_saturating_decimal();
+
+  int max = 0;
+  if (current() == '}') {
+    // {n}
+    max = min;
+    Advance();
+  } else if (current() == ',') {
+    Advance();
+    if (current() == '}') {
+      // {n,}
+      max = RegExpTree::kInfinity;
+      Advance();
+    } else {
+      // {n,m}
+      max = parse_saturating_decimal();
+      if (current() != '}') {
+        Reset(start);
+        return false;
+      }
+      Advance();
+    }
+  } else {
+    Reset(start);
+    return false;
+  }
+
+  *min_out = min;
+  *max_out = max;
+  return true;
+}
+
+// Parses a legacy octal escape starting at the current character and returns
+// its value, leaving the parser just past the digits consumed.
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
+  DCHECK(('0' <= current() && current() <= '7') || !has_more());
+  const auto at_octal_digit = [this]() {
+    return '0' <= current() && current() <= '7';
+  };
+
+  // For compatibility with some other browsers (not all), we parse
+  // up to three octal digits with a value below 256.
+  // ES#prod-annexB-LegacyOctalEscapeSequence
+  base::uc32 value = current() - '0';
+  Advance();
+  if (!at_octal_digit()) return value;
+
+  value = value * 8 + current() - '0';
+  Advance();
+  // A third digit is only taken while the result stays below 256, i.e. when
+  // the first two digits are worth less than 32.
+  if (value >= 32 || !at_octal_digit()) return value;
+
+  value = value * 8 + current() - '0';
+  Advance();
+  return value;
+}
+
+// Reads exactly |length| hexadecimal digits starting at the current
+// character. On success stores their value in |*value| and returns true; if
+// any of them is not a hex digit, the position is restored, |*value| is left
+// untouched and false is returned.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
+  const int start = position();
+  base::uc32 accumulated = 0;
+  for (int remaining = length; remaining > 0; --remaining) {
+    const int digit = base::HexValue(current());
+    if (digit < 0) {
+      Reset(start);
+      return false;
+    }
+    accumulated = accumulated * 16 + digit;
+    Advance();
+  }
+  *value = accumulated;
+  return true;
+}
+
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
+// The leading "\u" has already been consumed by the caller. Returns true and
+// stores the code point in |*value| on success; returns false with the
+// position restored otherwise.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
+  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
+  // allowed). In the latter case, the number of hex digits between { } is
+  // arbitrary.
+  if (current() == '{' && IsUnicodeMode()) {
+    const int brace_position = position();
+    Advance();
+    const bool well_formed =
+        ParseUnlimitedLengthHexNumber(0x10FFFF, value) && current() == '}';
+    if (!well_formed) {
+      Reset(brace_position);
+      return false;
+    }
+    Advance();
+    return true;
+  }
+
+  // \u but no {, or \u{...} escapes not allowed.
+  if (!ParseHexEscape(4, value)) return false;
+
+  const bool may_start_surrogate_pair =
+      IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+      current() == '\\';
+  if (!may_start_surrogate_pair) return true;
+
+  // Attempt to read trail surrogate. If that fails, the lead surrogate alone
+  // is the result and the parser is put back right behind it.
+  const int after_lead = position();
+  if (Next() == 'u') {
+    Advance(2);
+    base::uc32 trail;
+    if (ParseHexEscape(4, &trail) && unibrow::Utf16::IsTrailSurrogate(trail)) {
+      *value = unibrow::Utf16::CombineSurrogatePair(
+          static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
+      return true;
+    }
+  }
+  Reset(after_lead);
+  return true;
+}
+
+#ifdef V8_INTL_SUPPORT
+
+namespace {
+
+// Returns true iff |property_name| is spelled exactly like one of the names
+// ICU reports for |property|: the short name, or any of the long names
+// obtained by counting up from U_LONG_PROPERTY_NAME until ICU returns null.
+bool IsExactPropertyAlias(const char* property_name, UProperty property) {
+  const char* const short_name =
+      u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
+  if (short_name != nullptr && strcmp(property_name, short_name) == 0) {
+    return true;
+  }
+
+  int choice = U_LONG_PROPERTY_NAME;
+  while (const char* long_name = u_getPropertyName(
+             property, static_cast<UPropertyNameChoice>(choice))) {
+    if (strcmp(property_name, long_name) == 0) return true;
+    ++choice;
+  }
+  return false;
+}
+
+// Returns true iff |property_value_name| is spelled exactly like one of the
+// names ICU reports for |property_value| of |property|: the short name, or
+// any of the long names obtained by counting up from U_LONG_PROPERTY_NAME
+// until ICU returns null.
+bool IsExactPropertyValueAlias(const char* property_value_name,
+                               UProperty property, int32_t property_value) {
+  const char* const short_name =
+      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
+  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
+    return true;
+  }
+
+  int choice = U_LONG_PROPERTY_NAME;
+  while (const char* long_name = u_getPropertyValueName(
+             property, property_value,
+             static_cast<UPropertyNameChoice>(choice))) {
+    if (strcmp(property_value_name, long_name) == 0) return true;
+    ++choice;
+  }
+  return false;
+}
+
+// Copies every string element of |set| into |strings|. Each entry is keyed by
+// the string's code points (case-folded when |flags| has ignore-case set) and
+// maps to the regexp tree built from the unfolded code points.
+void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set,
+                                  CharacterClassStrings* strings,
+                                  RegExpFlags flags, Zone* zone) {
+  DCHECK(set.hasStrings());
+  DCHECK(IsUnicodeSets(flags));
+  DCHECK_NOT_NULL(strings);
+
+  const bool fold_case = IsIgnoreCase(flags);
+  RegExpTextBuilder::SmallRegExpTreeVector tree_storage(
+      ZoneAllocator<RegExpTree*>{zone});
+  RegExpTextBuilder text_builder(zone, &tree_storage, flags);
+
+  icu::UnicodeSetIterator it(set);
+  for (it.skipToStrings(); it.next();) {
+    const icu::UnicodeString& utf16 = it.getString();
+    const char16_t* const units = utf16.getBuffer();
+    const int32_t unit_count = utf16.length();
+    ZoneList<base::uc32>* code_points =
+        zone->template New<ZoneList<base::uc32>>(unit_count, zone);
+
+    int32_t offset = 0;
+    while (offset < unit_count) {
+      UChar32 code_point;
+      // U16_NEXT decodes one code point and advances |offset| past it.
+      U16_NEXT(units, offset, unit_count, code_point);
+      text_builder.AddUnicodeCharacter(code_point);
+      code_points->Add(
+          fold_case ? u_foldCase(code_point, U_FOLD_CASE_DEFAULT) : code_point,
+          zone);
+    }
+
+    strings->emplace(code_points->ToVector(), text_builder.ToRegExp());
+    // The storage is reused for the next string.
+    tree_storage.clear();
+  }
+}
+
+// Looks up |property_value_name| as a value of |property| and, if it names a
+// non-empty set, appends that set's code point ranges to |result_ranges|
+// (complemented when |negate| is set) and its strings, if any, to
+// |result_strings|. Returns false without adding anything when the name is
+// unknown, is not an exact alias, ICU reports anything other than
+// U_ZERO_ERROR, or the set is empty.
+bool LookupPropertyValueName(UProperty property,
+                             const char* property_value_name, bool negate,
+                             ZoneList<CharacterRange>* result_ranges,
+                             CharacterClassStrings* result_strings,
+                             RegExpFlags flags, Zone* zone) {
+  // For the property Script_Extensions, we have to do the property value
+  // name lookup as if the property is Script.
+  const UProperty property_for_lookup =
+      property == UCHAR_SCRIPT_EXTENSIONS ? UCHAR_SCRIPT : property;
+
+  const int32_t property_value =
+      u_getPropertyValueEnum(property_for_lookup, property_value_name);
+  if (property_value == UCHAR_INVALID_CODE) return false;
+
+  // We require the property name to match exactly to one of the property value
+  // aliases. However, u_getPropertyValueEnum uses loose matching.
+  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
+                                 property_value)) {
+    return false;
+  }
+
+  UErrorCode ec = U_ZERO_ERROR;
+  icu::UnicodeSet set;
+  set.applyIntPropertyValue(property, property_value, ec);
+  if (ec != U_ZERO_ERROR || set.isEmpty()) return false;
+
+  // Strings are pulled out first; only code point ranges are handled below.
+  if (set.hasStrings()) {
+    ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
+  }
+  if (IsUnicodeSets(flags) && IsIgnoreCase(flags)) {
+    CharacterRange::UnicodeSimpleCloseOver(set);
+  }
+  set.removeAllStrings();
+  if (negate) set.complement();
+
+  const int32_t range_count = set.getRangeCount();
+  for (int32_t i = 0; i < range_count; ++i) {
+    result_ranges->Add(
+        CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
+  }
+  return true;
+}
+
+// Returns true iff the NUL-terminated |name| is exactly the string held in
+// |literal|. N is the size of the array, so for a string literal it already
+// counts the terminating NUL: comparing N characters covers the terminator
+// and never asks strncmp to look beyond the end of |literal|.
+template <size_t N>
+inline bool NameEquals(const char* name, const char (&literal)[N]) {
+  return strncmp(name, literal, N) == 0;
+}
+
+// Handles the three property names that are resolved here rather than via a
+// direct ICU lookup of the same name: "Any", "ASCII" and "Assigned". Returns
+// false for every other name.
+bool LookupSpecialPropertyValueName(const char* name,
+                                    ZoneList<CharacterRange>* result,
+                                    bool negate, RegExpFlags flags,
+                                    Zone* zone) {
+  if (NameEquals(name, "Any")) {
+    // The negation of 'Any' is the empty set, so in that case the list of
+    // character ranges is left as it is.
+    if (!negate) result->Add(CharacterRange::Everything(), zone);
+    return true;
+  }
+
+  if (NameEquals(name, "ASCII")) {
+    if (negate) {
+      result->Add(CharacterRange::Range(0x80, String::kMaxCodePoint), zone);
+    } else {
+      result->Add(CharacterRange::Range(0x0, 0x7F), zone);
+    }
+    return true;
+  }
+
+  if (NameEquals(name, "Assigned")) {
+    // 'Assigned' is expressed as the complement of General_Category
+    // 'Unassigned', hence the inverted |negate|.
+    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
+                                   !negate, result, nullptr, flags, zone);
+  }
+
+  return false;
+}
+
+// Explicitly allowlist supported binary properties. The spec forbids supporting
+// properties outside of this set to ensure interoperability.
+// Properties of strings are only supported when |unicode_sets| is true.
+bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) {
+  switch (property) {
+    // Properties of strings.
+    case UCHAR_BASIC_EMOJI:
+    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
+    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
+    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
+    case UCHAR_RGI_EMOJI:
+      return unicode_sets;
+
+    // Properties of single code points.
+    // 'Any', 'ASCII' and 'Assigned' are not supported by ICU. See
+    // LookupSpecialPropertyValueName.
+    case UCHAR_ALPHABETIC:
+    case UCHAR_ASCII_HEX_DIGIT:
+    case UCHAR_BIDI_CONTROL:
+    case UCHAR_BIDI_MIRRORED:
+    case UCHAR_CASE_IGNORABLE:
+    case UCHAR_CASED:
+    case UCHAR_CHANGES_WHEN_CASEFOLDED:
+    case UCHAR_CHANGES_WHEN_CASEMAPPED:
+    case UCHAR_CHANGES_WHEN_LOWERCASED:
+    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
+    case UCHAR_CHANGES_WHEN_TITLECASED:
+    case UCHAR_CHANGES_WHEN_UPPERCASED:
+    case UCHAR_DASH:
+    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
+    case UCHAR_DEPRECATED:
+    case UCHAR_DIACRITIC:
+    case UCHAR_EMOJI:
+    case UCHAR_EMOJI_COMPONENT:
+    case UCHAR_EMOJI_MODIFIER_BASE:
+    case UCHAR_EMOJI_MODIFIER:
+    case UCHAR_EMOJI_PRESENTATION:
+    case UCHAR_EXTENDED_PICTOGRAPHIC:
+    case UCHAR_EXTENDER:
+    case UCHAR_GRAPHEME_BASE:
+    case UCHAR_GRAPHEME_EXTEND:
+    case UCHAR_HEX_DIGIT:
+    case UCHAR_ID_CONTINUE:
+    case UCHAR_ID_START:
+    case UCHAR_IDEOGRAPHIC:
+    case UCHAR_IDS_BINARY_OPERATOR:
+    case UCHAR_IDS_TRINARY_OPERATOR:
+    case UCHAR_JOIN_CONTROL:
+    case UCHAR_LOGICAL_ORDER_EXCEPTION:
+    case UCHAR_LOWERCASE:
+    case UCHAR_MATH:
+    case UCHAR_NONCHARACTER_CODE_POINT:
+    case UCHAR_PATTERN_SYNTAX:
+    case UCHAR_PATTERN_WHITE_SPACE:
+    case UCHAR_QUOTATION_MARK:
+    case UCHAR_RADICAL:
+    case UCHAR_REGIONAL_INDICATOR:
+    case UCHAR_S_TERM:
+    case UCHAR_SOFT_DOTTED:
+    case UCHAR_TERMINAL_PUNCTUATION:
+    case UCHAR_UNIFIED_IDEOGRAPH:
+    case UCHAR_UPPERCASE:
+    case UCHAR_VARIATION_SELECTOR:
+    case UCHAR_WHITE_SPACE:
+    case UCHAR_XID_CONTINUE:
+    case UCHAR_XID_START:
+      return true;
+
+    default:
+      return false;
+  }
+}
+
+// Returns true for the binary properties listed below, which this file
+// treats as properties of strings, and false for every other property.
+bool IsBinaryPropertyOfStrings(UProperty property) {
+  switch (property) {
+    case UCHAR_BASIC_EMOJI:
+    case UCHAR_EMOJI_KEYCAP_SEQUENCE:
+    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
+    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
+    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
+    case UCHAR_RGI_EMOJI:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true iff |c| may appear in a property name or value: an ASCII
+// letter, an ASCII digit, or '_'.
+bool IsUnicodePropertyValueCharacter(char c) {
+  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
+  //
+  // Note that using this to validate each parsed char is quite conservative.
+  // A possible alternative solution would be to only ensure the parsed
+  // property name/value candidate string does not contain '\0' characters and
+  // let ICU lookups trigger the final failure.
+  const bool is_lower = c >= 'a' && c <= 'z';
+  const bool is_upper = c >= 'A' && c <= 'Z';
+  const bool is_digit = c >= '0' && c <= '9';
+  return is_lower || is_upper || is_digit || c == '_';
+}
+
+} // namespace
+
+// Parses the "{...}" that follows \p or \P into |name_1| and, when an '=' is
+// present, |name_2|; both must be empty on entry. On success the names are
+// NUL-terminated (|name_2| stays empty without '='), the parser is left just
+// past the '}' and true is returned. Returns false when there is no '{', a
+// character is not allowed in a property name/value, or the pattern ends
+// first.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
+                                                     ZoneVector<char>* name_2) {
+  DCHECK(name_1->empty());
+  DCHECK(name_2->empty());
+  // Parse the property class as follows:
+  // - In \p{name}, 'name' is interpreted
+  //   - either as a general category property value name.
+  //   - or as a binary property name.
+  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
+  //   and 'value' is interpreted as one of the available property value names.
+  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
+  // - Loose matching is not applied.
+  if (current() != '{') return false;
+
+  // Parse \p{[PropertyName=]PropertyNameValue}
+  Advance();
+  while (current() != '}' && current() != '=') {
+    if (!IsUnicodePropertyValueCharacter(current()) || !has_next()) {
+      return false;
+    }
+    name_1->push_back(static_cast<char>(current()));
+    Advance();
+  }
+
+  if (current() == '=') {
+    Advance();
+    while (current() != '}') {
+      if (!IsUnicodePropertyValueCharacter(current()) || !has_next()) {
+        return false;
+      }
+      name_2->push_back(static_cast<char>(current()));
+      Advance();
+    }
+    name_2->push_back(0);  // null-terminate string.
+  }
+
+  Advance();             // Step over the closing '}'.
+  name_1->push_back(0);  // null-terminate string.
+
+  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
+  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
+  return true;
+}
+
+// Resolves the names produced by ParsePropertyClassName and adds the matching
+// ranges/strings to |add_to_ranges| / |add_to_strings|. Returns false if the
+// names do not denote a supported property (value).
+template <class CharT>
+bool RegExpParserImpl<CharT>::AddPropertyClassRange(
+    ZoneList<CharacterRange>* add_to_ranges,
+    CharacterClassStrings* add_to_strings, bool negate,
+    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
+  if (!name_2.empty()) {
+    // Both property name and value name are specified. Attempt to interpret
+    // the property name as enumerated property.
+    const char* const property_name = name_1.data();
+    const char* const value_name = name_2.data();
+    UProperty property = u_getPropertyEnum(property_name);
+    if (!IsExactPropertyAlias(property_name, property)) return false;
+    switch (property) {
+      case UCHAR_GENERAL_CATEGORY:
+        // We want to allow aggregate value names such as "Letter".
+        property = UCHAR_GENERAL_CATEGORY_MASK;
+        break;
+      case UCHAR_SCRIPT:
+      case UCHAR_SCRIPT_EXTENSIONS:
+        break;
+      default:
+        return false;
+    }
+    return LookupPropertyValueName(property, value_name, negate, add_to_ranges,
+                                   add_to_strings, flags(), zone());
+  }
+
+  // Only one name was given.
+  const char* const name = name_1.data();
+
+  // First attempt to interpret as general category property value name.
+  if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
+                              add_to_ranges, add_to_strings, flags(),
+                              zone())) {
+    return true;
+  }
+
+  // Interpret "Any", "ASCII", and "Assigned".
+  if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(),
+                                     zone())) {
+    return true;
+  }
+
+  // Then attempt to interpret as binary property name with value name 'Y'.
+  const UProperty property = u_getPropertyEnum(name);
+  const bool is_usable_binary_property =
+      IsSupportedBinaryProperty(property, unicode_sets()) &&
+      IsExactPropertyAlias(name, property);
+  if (!is_usable_binary_property) return false;
+
+  // Negation of properties with strings is not allowed.
+  // TODO(v8:11935): Change permalink once proposal is in stage 4.
+  // See
+  // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-static-semantics-maycontainstrings
+  if (negate && IsBinaryPropertyOfStrings(property)) return false;
+
+  // Negation is expressed by looking up value 'N' rather than by
+  // complementing the 'Y' set.
+  const char* const value_name = negate ? "N" : "Y";
+  return LookupPropertyValueName(property, value_name, false, add_to_ranges,
+                                 add_to_strings, flags(), zone());
+}
+
+#else // V8_INTL_SUPPORT
+/* Variant compiled when V8_INTL_SUPPORT is not defined: it parses nothing,
+ * leaves |name_1| and |name_2| untouched and always returns false.
+ */
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2) {
+ return false;
+}
+/* Variant compiled when V8_INTL_SUPPORT is not defined: it adds nothing to
+ * |add_to_ranges| or |add_to_strings| and always returns false.
+ */
+template <class CharT>
+bool RegExpParserImpl<CharT>::AddPropertyClassRange(
+ ZoneList<CharacterRange>* add_to_ranges,
+ CharacterClassStrings* add_to_strings, bool negate,
+ const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
+ return false;
+}
+
+#endif // V8_INTL_SUPPORT
+
+// Reads a non-empty run of hexadecimal digits of any length. Returns true and
+// stores the value in |*value| if it does not exceed |max_value|. Returns
+// false if the current character is not a hex digit or as soon as the value
+// exceeds |max_value|; the position is not restored in that case.
+template <class CharT>
+bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
+                                                            base::uc32* value) {
+  int digit = base::HexValue(current());
+  if (digit < 0) return false;
+
+  const base::uc32 limit = static_cast<base::uc32>(max_value);
+  base::uc32 accumulated = 0;
+  do {
+    accumulated = accumulated * 16 + digit;
+    if (accumulated > limit) return false;
+    Advance();
+    digit = base::HexValue(current());
+  } while (digit >= 0);
+
+  *value = accumulated;
+  return true;
+}
+
+/* https://tc39.es/ecma262/#prod-CharacterEscape
+ *
+ * Parses a single CharacterEscape. On entry current() is the backslash and
+ * at least one more character follows (both are DCHECKed). Returns the code
+ * point the escape denotes.
+ *
+ * |in_class_escape_state| tells whether the escape sits inside a character
+ * class; that enables the extra \c forms (digits and '_') and, with /v,
+ * escaped ClassSetReservedPunctuators.
+ * |*is_escaped_unicode_character| is set to true only when a \u escape was
+ * parsed successfully; it is never written otherwise.
+ *
+ * Every ReportError call in this function is immediately followed by
+ * `return 0`. Note that 0 is also the legitimate result of "\0".
+ */
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
+ InClassEscapeState in_class_escape_state,
+ bool* is_escaped_unicode_character) {
+ DCHECK_EQ('\\', current());
+ DCHECK(has_next());
+
+ Advance();  // Step over the backslash; current() is now the escaped char.
+
+ const base::uc32 c = current();
+ switch (c) {
+ // CharacterEscape ::
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance();
+ return '\f';
+ case 'n':
+ Advance();
+ return '\n';
+ case 'r':
+ Advance();
+ return '\r';
+ case 't':
+ Advance();
+ return '\t';
+ case 'v':
+ Advance();
+ return '\v';
+ // CharacterEscape ::
+ // c ControlLetter
+ case 'c': {
+ base::uc32 controlLetter = Next();
+ base::uc32 letter = controlLetter & ~('A' ^ 'a');  // Clear the ASCII case bit so a-z compare as A-Z.
+ if (letter >= 'A' && letter <= 'Z') {
+ Advance(2);
+ // Control letters mapped to ASCII control characters in the range
+ // 0x00-0x1F.
+ return controlLetter & 0x1F;
+ }
+ if (IsUnicodeMode()) {
+ // With /u and /v, invalid escapes are not treated as identity escapes.
+ ReportError(RegExpError::kInvalidUnicodeEscape);
+ return 0;
+ }
+ if (in_class_escape_state == InClassEscapeState::kInClass) {
+ // Inside a character class, we also accept digits and underscore as
+ // control characters, unless with /u or /v. See Annex B:
+ // ES#prod-annexB-ClassControlLetter
+ if ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_') {
+ Advance(2);
+ return controlLetter & 0x1F;
+ }
+ }
+ // We match JSC in reading the backslash as a literal
+ // character instead of as starting an escape.
+ return '\\';  // No further Advance(): the parser stays on the 'c'.
+ }
+ // CharacterEscape ::
+ // 0 [lookahead ∉ DecimalDigit]
+ // [~UnicodeMode] LegacyOctalEscapeSequence
+ case '0':
+ // \0 is interpreted as NUL if not followed by another digit.
+ if (Next() < '0' || Next() > '9') {
+ Advance();
+ return 0;
+ }
+ V8_FALLTHROUGH;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ // For compatibility, we interpret a decimal escape that isn't
+ // a back reference (and therefore either \0 or not valid according
+ // to the specification) as a 1..3 digit octal character code.
+ // ES#prod-annexB-LegacyOctalEscapeSequence
+ if (IsUnicodeMode()) {
+ // With /u or /v, decimal escape is not interpreted as octal character
+ // code.
+ ReportError(RegExpError::kInvalidClassEscape);
+ return 0;
+ }
+ return ParseOctalLiteral();
+ // CharacterEscape ::
+ // HexEscapeSequence
+ case 'x': {
+ Advance();
+ base::uc32 value;
+ if (ParseHexEscape(2, &value)) return value;
+ if (IsUnicodeMode()) {
+ // With /u or /v, invalid escapes are not treated as identity escapes.
+ ReportError(RegExpError::kInvalidEscape);
+ return 0;
+ }
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
+ }
+ // CharacterEscape ::
+ // RegExpUnicodeEscapeSequence [?UnicodeMode]
+ case 'u': {
+ Advance();
+ base::uc32 value;
+ if (ParseUnicodeEscape(&value)) {
+ *is_escaped_unicode_character = true;
+ return value;
+ }
+ if (IsUnicodeMode()) {
+ // With /u or /v, invalid escapes are not treated as identity escapes.
+ ReportError(RegExpError::kInvalidUnicodeEscape);
+ return 0;
+ }
+ // If \u is not followed by a valid unicode escape sequence, treat it
+ // as an identity escape.
+ return 'u';
+ }
+ default:
+ break;
+ }
+
+ // CharacterEscape ::
+ // IdentityEscape[?UnicodeMode, ?N]
+ //
+ // * With /u, no identity escapes except for syntax characters are
+ // allowed.
+ // * With /v, no identity escapes except for syntax characters and
+ // ClassSetReservedPunctuators (if within a class) are allowed.
+ // * Without /u or /v:
+ // * '\c' is not an IdentityEscape.
+ // * '\k' is not an IdentityEscape when named captures exist.
+ // * Otherwise, all identity escapes are allowed.
+ if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) {
+ if (IsClassSetReservedPunctuator(c)) {
+ Advance();
+ return c;
+ }
+ }
+ if (IsUnicodeMode()) {
+ if (!IsSyntaxCharacterOrSlash(c)) {
+ ReportError(RegExpError::kInvalidEscape);
+ return 0;
+ }
+ Advance();
+ return c;
+ }
+ DCHECK(!IsUnicodeMode());
+ if (c == 'c') {  // NOTE(review): `case 'c'` above returns on every path, so this looks unreachable.
+ ReportError(RegExpError::kInvalidEscape);
+ return 0;
+ }
+ Advance();
+ // Note: It's important to Advance before the HasNamedCaptures call s.t. we
+ // don't start scanning in the middle of an escape.
+ if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
+ ReportError(RegExpError::kInvalidEscape);
+ return 0;
+ }
+ return c;
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassRanges
+//
+// Parses class atoms and ranges up to (not including) the closing ']' or the
+// end of the pattern, adding them to |ranges|. Returns nullptr unless an
+// error is reported, in which case the result of ReportError is returned.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(
+    ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) {
+  base::uc32 first, last;
+  bool first_is_class, last_is_class;
+  while (has_more() && current() != ']') {
+    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &first,
+                     &first_is_class CHECK_FAILED);
+
+    // A lone ClassAtom that is not followed by '-'.
+    if (current() != '-') {
+      if (!first_is_class) {
+        ranges->Add(CharacterRange::Singleton(first), zone());
+      }
+      continue;
+    }
+
+    Advance();
+    if (!has_more()) {
+      // If we reach the end we break out of the loop and let the
+      // following code report an error.
+      break;
+    }
+    if (current() == ']') {
+      // A '-' directly before the closing bracket is a literal dash.
+      if (!first_is_class) {
+        ranges->Add(CharacterRange::Singleton(first), zone());
+      }
+      ranges->Add(CharacterRange::Singleton('-'), zone());
+      break;
+    }
+
+    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &last,
+                     &last_is_class CHECK_FAILED);
+
+    if (first_is_class || last_is_class) {
+      // Either end is an escaped character class. Treat the '-' verbatim.
+      if (IsUnicodeMode()) {
+        // ES2015 21.2.2.15.1 step 1.
+        return ReportError(RegExpError::kInvalidCharacterClass);
+      }
+      if (!first_is_class) {
+        ranges->Add(CharacterRange::Singleton(first), zone());
+      }
+      ranges->Add(CharacterRange::Singleton('-'), zone());
+      if (!last_is_class) {
+        ranges->Add(CharacterRange::Singleton(last), zone());
+      }
+      continue;
+    }
+
+    // ES2015 21.2.2.15.1 step 6.
+    if (first > last) {
+      return ReportError(RegExpError::kOutOfOrderCharacterClass);
+    }
+    ranges->Add(CharacterRange::Range(first, last), zone());
+  }
+  return nullptr;
+}
+
+// https://tc39.es/ecma262/#prod-ClassEscape
+//
+// Parses one class atom. If it is a character class escape (\d, \p{..}, ...),
+// its ranges are added to |ranges| and |*is_class_escape| is set to true;
+// otherwise |*is_class_escape| is false and the single character is stored
+// in |*char_out|. A backslash at the end of the pattern is reported as
+// kEscapeAtEndOfPattern and |*char_out| is not written.
+template <class CharT>
+void RegExpParserImpl<CharT>::ParseClassEscape(
+    ZoneList<CharacterRange>* ranges, Zone* zone,
+    bool add_unicode_case_equivalents, base::uc32* char_out,
+    bool* is_class_escape) {
+  *is_class_escape = false;
+
+  if (current() != '\\') {
+    // Not a ClassEscape.
+    *char_out = current();
+    Advance();
+    return;
+  }
+
+  const base::uc32 escaped = Next();
+  if (escaped == 'b') {
+    // Inside a class, \b denotes the backspace character.
+    *char_out = '\b';
+    Advance(2);
+    return;
+  }
+  if (escaped == '-' && IsUnicodeMode()) {
+    *char_out = escaped;
+    Advance(2);
+    return;
+  }
+  if (escaped == kEndMarker) {
+    ReportError(RegExpError::kEscapeAtEndOfPattern);
+    return;
+  }
+
+  *is_class_escape = TryParseCharacterClassEscape(
+      escaped, InClassEscapeState::kInClass, ranges, nullptr, zone,
+      add_unicode_case_equivalents);
+  if (*is_class_escape) return;
+
+  bool is_escaped_unicode_character = false;  // Unused.
+  *char_out = ParseCharacterEscape(InClassEscapeState::kInClass,
+                                   &is_escaped_unicode_character);
+}
+
+// https://tc39.es/ecma262/#prod-CharacterClassEscape
+//
+// Expects current() to be the backslash and |next| the character after it.
+// If they form a character class escape, the escape is consumed, its
+// contents are added to |ranges| / |strings| and true is returned. True is
+// also returned when a \p / \P escape turns out to be invalid; the error has
+// been reported in that case. Returns false, consuming nothing, for anything
+// else (including \p / \P outside unicode mode).
+template <class CharT>
+bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
+    base::uc32 next, InClassEscapeState in_class_escape_state,
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
+    Zone* zone, bool add_unicode_case_equivalents) {
+  DCHECK_EQ(current(), '\\');
+  DCHECK_EQ(Next(), next);
+
+  switch (next) {
+    case 'd':
+    case 'D':
+    case 's':
+    case 'S':
+    case 'w':
+    case 'W': {
+      CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
+                                     ranges, add_unicode_case_equivalents,
+                                     zone);
+      Advance(2);
+      return true;
+    }
+    case 'p':
+    case 'P':
+      // Handled below.
+      break;
+    default:
+      return false;
+  }
+
+  // Property escapes only exist in unicode mode.
+  if (!IsUnicodeMode()) return false;
+
+  const bool negate = (next == 'P');
+  Advance(2);
+  ZoneVector<char> name_1(zone);
+  ZoneVector<char> name_2(zone);
+  const bool resolved =
+      ParsePropertyClassName(&name_1, &name_2) &&
+      AddPropertyClassRange(ranges, strings, negate, name_1, name_2);
+  if (!resolved) {
+    const RegExpError error =
+        in_class_escape_state == InClassEscapeState::kInClass
+            ? RegExpError::kInvalidClassPropertyName
+            : RegExpError::kInvalidPropertyName;
+    ReportError(error);
+  }
+  return true;
+}
+
+namespace {
+
+// A |normalized_string| of length 1 is added to |ranges| as a singleton.
+// Any other length is added to |strings|, mapped to |regexp_string|.
+void AddClassString(ZoneList<base::uc32>* normalized_string,
+                    RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,
+                    CharacterClassStrings* strings, Zone* zone) {
+  if (normalized_string->length() != 1) {
+    strings->emplace(normalized_string->ToVector(), regexp_string);
+    return;
+  }
+  ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);
+}
+
+} // namespace
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassStringDisjunction
+//
+// Parses \q{alt1|alt2|...}. Each alternative goes through AddClassString, so
+// one-character alternatives end up in |ranges| and all others in |strings|.
+// Returns nullptr unless an error is reported.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  DCHECK_EQ(current(), '\\');
+  DCHECK_EQ(Next(), 'q');
+  Advance(2);
+  if (current() != '{') {
+    // Identity escape of 'q' is not allowed in unicode mode.
+    return ReportError(RegExpError::kInvalidEscape);
+  }
+  Advance();
+
+  ZoneList<base::uc32>* alternative =
+      zone()->template New<ZoneList<base::uc32>>(4, zone());
+  RegExpTextBuilder::SmallRegExpTreeVector tree_storage(
+      ZoneAllocator<RegExpTree*>{zone()});
+  RegExpTextBuilder text_builder(zone(), &tree_storage, flags());
+
+  // Hands the alternative collected so far to AddClassString.
+  const auto flush_alternative = [&]() {
+    AddClassString(alternative, text_builder.ToRegExp(), ranges, strings,
+                   zone());
+  };
+
+  while (has_more() && current() != '}') {
+    if (current() != '|') {
+      base::uc32 c = ParseClassSetCharacter(CHECK_FAILED);
+      if (ignore_case()) {
+#ifdef V8_INTL_SUPPORT
+        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+#else
+        c = AsciiAlphaToLower(c);
+#endif
+      }
+      alternative->Add(c, zone());
+      text_builder.AddUnicodeCharacter(c);
+      continue;
+    }
+
+    // '|' ends the current alternative and starts a fresh one.
+    flush_alternative();
+    alternative = zone()->template New<ZoneList<base::uc32>>(4, zone());
+    tree_storage.clear();
+    Advance();
+  }
+
+  flush_alternative();
+  CharacterRange::Canonicalize(ranges);
+
+  // We don't need to handle missing closing '}' here.
+  // If the character class is correctly closed, ParseClassSetCharacter will
+  // report an error.
+  Advance();
+  return nullptr;
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// Parses one ClassSetOperand into freshly allocated ranges/strings by calling
+// the ranges/strings overload, and always returns a tree for the operand.
+/* Tree returned based on |*type_out|:
+ *  - kNestedClass: the RegExpClassSetExpression returned by the
+ *    ranges/strings overload, passed through unchanged.
+ *  - For all other types: that overload returned nullptr, so a new
+ *    RegExpClassSetOperand wrapping the collected ranges and strings is
+ *    returned instead.
+ */
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
+ const RegExpBuilder* builder, ClassSetOperandType* type_out) {
+ ZoneList<CharacterRange>* ranges =
+ zone()->template New<ZoneList<CharacterRange>>(1, zone());
+ CharacterClassStrings* strings =
+ zone()->template New<CharacterClassStrings>(zone());
+ RegExpTree* tree =
+ ParseClassSetOperand(builder, type_out, ranges, strings CHECK_FAILED);
+ DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
+ tree == nullptr);
+ DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
+ ranges->length() == 1);
+ DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
+ strings->empty());
+ DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+ ranges->is_empty());
+ DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+ strings->empty());
+ DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+ tree->IsClassSetExpression());
+ // ClassSetRange is only used within ClassSetUnion().
+ DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
+ // There are no restrictions for kCharacterClassEscape.
+ // CharacterClassEscape includes \p{}, which can contain ranges, strings or
+ // both and \P{}, which could contain nothing (i.e. \P{Any}).
+ if (tree == nullptr) {  // Every operand type except kNestedClass (see DCHECKs above).
+ tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
+ }
+ return tree;
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// Parses one ClassSetOperand and records its kind in |*type_out|:
+// * kNestedClass ('['): the tree produced by ParseCharacterClass is
+//   returned; |ranges| and |strings| are not modified here.
+// * kClassStringDisjunction (\q{..}), kCharacterClassEscape (\d, \p{..}, ..)
+//   and kClassSetCharacter: |ranges| / |strings| are modified and nullptr is
+//   returned.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
+    const RegExpBuilder* builder, ClassSetOperandType* type_out,
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  const base::uc32 first = current();
+
+  if (first == '[') {
+    *type_out = ClassSetOperandType::kNestedClass;
+    return ParseCharacterClass(builder);
+  }
+
+  if (first == '\\') {
+    const base::uc32 escaped = Next();
+    if (escaped == 'q') {
+      *type_out = ClassSetOperandType::kClassStringDisjunction;
+      ParseClassStringDisjunction(ranges, strings CHECK_FAILED);
+      return nullptr;
+    }
+
+    const bool add_unicode_case_equivalents = ignore_case();
+    const bool is_class_escape = TryParseCharacterClassEscape(
+        escaped, InClassEscapeState::kInClass, ranges, strings, zone(),
+        add_unicode_case_equivalents);
+    if (is_class_escape) {
+      *type_out = ClassSetOperandType::kCharacterClassEscape;
+      return nullptr;
+    }
+    // Any other escape denotes a single character and is handled below.
+  }
+
+  *type_out = ClassSetOperandType::kClassSetCharacter;
+  const base::uc32 character = ParseClassSetCharacter(CHECK_FAILED);
+  ranges->Add(CharacterRange::Singleton(character), zone());
+  return nullptr;
+}
+
+ // Parses a single ClassSetCharacter at the current position and returns it.
+ // On a syntax error ReportError() is called and 0 is returned.
+ template <class CharT>
+ base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
+   DCHECK(unicode_sets());
+   const base::uc32 ch = current();
+
+   if (ch != '\\') {
+     // Unescaped character: reject the characters reserved by the grammar.
+     if (IsClassSetSyntaxCharacter(ch)) {
+       ReportError(RegExpError::kInvalidCharacterInClass);
+       return 0;
+     }
+     if (IsClassSetReservedDoublePunctuator(ch)) {
+       ReportError(RegExpError::kInvalidClassSetOperation);
+       return 0;
+     }
+     Advance();
+     return ch;
+   }
+
+   // Escaped character.
+   const base::uc32 escaped = Next();
+   if (escaped == 'b') {
+     Advance(2);
+     return '\b';
+   }
+   if (escaped == kEndMarker) {
+     ReportError(RegExpError::kEscapeAtEndOfPattern);
+     return 0;
+   }
+
+   bool ignored = false;  // Out-parameter whose value is not needed here.
+   return ParseCharacterEscape(InClassEscapeState::kInClass, &ignored);
+ }
+
+namespace {
+
+ // Reports whether a class set operand of kind |type| may contain strings.
+ // |operand| is only dereferenced for kCharacterClassEscape,
+ // kClassStringDisjunction and kNestedClass, so it must be non-null for those
+ // kinds. Single characters and character ranges never contain strings.
+ bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
+   switch (type) {
+     case ClassSetOperandType::kClassSetCharacter:
+     case ClassSetOperandType::kClassSetRange:
+       return false;
+     case ClassSetOperandType::kCharacterClassEscape:
+     case ClassSetOperandType::kClassStringDisjunction:
+       return operand->AsClassSetOperand()->has_strings();
+     case ClassSetOperandType::kNestedClass:
+       // A nested class parsed as plain class ranges cannot hold strings.
+       if (operand->IsClassRanges()) return false;
+       return operand->AsClassSetExpression()->may_contain_strings();
+   }
+   // Every enumerator returns above. Without this, an out-of-range |type|
+   // would flow off the end of a non-void function, which is undefined
+   // behavior.
+   UNREACHABLE();
+ }
+
+} // namespace
+
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassUnion
+ // Parses the remaining operands of a class union up to and including the
+ // closing ']'. The caller has already parsed the first operand and passes it
+ // either as |first_operand| (a tree) or through |ranges|/|strings|.
+ // Operands that do not produce a tree are accumulated in |ranges|/|strings|
+ // and emitted together as one RegExpClassSetOperand; operands that produce a
+ // tree are added to the operand list on their own. On a syntax error the
+ // result of ReportError() is returned.
+ template <class CharT>
+ RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
+ const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+ ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
+ CharacterClassStrings* strings) {
+ DCHECK(unicode_sets());
+ ZoneList<RegExpTree*>* operands =
+ zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+ bool may_contain_strings = false;
+ // Add the lhs to operands if necessary.
+ // Either the lhs values were added to |ranges|/|strings| (in which case
+ // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed
+ // as |first_operand| (in which case |ranges| and |strings| are empty).
+ if (first_operand != nullptr) {
+ may_contain_strings = MayContainStrings(first_operand_type, first_operand);
+ operands->Add(first_operand, zone());
+ }
+ // Kind of the most recently parsed operand; used to validate 'a-b' ranges.
+ ClassSetOperandType last_type = first_operand_type;
+ const bool needs_case_folding = ignore_case();
+ while (has_more() && current() != ']') {
+ if (current() == '-') {
+ // Mix of ClassSetRange and ClassSubtraction is not allowed.
+ if (Next() == '-') {
+ return ReportError(RegExpError::kInvalidClassSetOperation);
+ }
+ Advance();
+ if (!has_more()) {
+ // If we reach the end we break out of the loop and let the
+ // following code report an error.
+ break;
+ }
+ // If the lhs and rhs around '-' are both ClassSetCharacters, they
+ // represent a character range.
+ // In case one of them is not a ClassSetCharacter, it is a syntax error,
+ // as '-' can not be used unescaped within a class with /v.
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // See
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetRange
+ if (last_type != ClassSetOperandType::kClassSetCharacter) {
+ return ReportError(RegExpError::kInvalidCharacterClass);
+ }
+ // The returned tree is ignored: any rhs that is not a ClassSetCharacter
+ // is rejected right below.
+ ParseClassSetOperand(builder, &last_type, ranges, strings CHECK_FAILED);
+ if (last_type != ClassSetOperandType::kClassSetCharacter) {
+ return ReportError(RegExpError::kInvalidCharacterClass);
+ }
+ // Remove the last two singleton characters added to ranges, and combine
+ // them into a range.
+ auto rhs_ranges = ranges->RemoveLast();
+ auto lhs_ranges = ranges->RemoveLast();
+ DCHECK(lhs_ranges.IsSingleton());
+ DCHECK(rhs_ranges.IsSingleton());
+ base::uc32 from = lhs_ranges.from();
+ base::uc32 to = rhs_ranges.from();
+ if (from > to) {
+ return ReportError(RegExpError::kOutOfOrderCharacterClass);
+ }
+ ranges->Add(CharacterRange::Range(from, to), zone());
+ last_type = ClassSetOperandType::kClassSetRange;
+ } else {
+ DCHECK_NE(current(), '-');
+ RegExpTree* operand = ParseClassSetOperand(builder, &last_type, ranges,
+ strings CHECK_FAILED);
+ if (operand != nullptr) {
+ may_contain_strings |= MayContainStrings(last_type, operand);
+ // Add the range we started building as operand and reset the current
+ // range.
+ if (!ranges->is_empty() || !strings->empty()) {
+ if (needs_case_folding) {
+ CharacterRange::Canonicalize(ranges);
+ CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
+ }
+ may_contain_strings |= !strings->empty();
+ operands->Add(
+ zone()->template New<RegExpClassSetOperand>(ranges, strings),
+ zone());
+ // The emitted operand keeps the old lists; continue with fresh ones.
+ ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
+ strings = zone()->template New<CharacterClassStrings>(zone());
+ }
+ operands->Add(operand, zone());
+ }
+ }
+ }
+
+ if (!has_more()) {
+ return ReportError(RegExpError::kUnterminatedCharacterClass);
+ }
+
+ // Add the range we started building as operand.
+ if (!ranges->is_empty() || !strings->empty()) {
+ if (needs_case_folding) {
+ CharacterRange::Canonicalize(ranges);
+ CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
+ }
+ may_contain_strings |= !strings->empty();
+ operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
+ zone());
+ }
+
+ DCHECK_EQ(current(), ']');
+ Advance();
+
+ // A negated class is rejected if any of its operands may contain strings.
+ if (is_negated && may_contain_strings) {
+ return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+ }
+
+ if (operands->is_empty()) {
+ // Return empty expression if no operands were added (e.g. [\P{Any}]
+ // produces an empty range).
+ DCHECK(ranges->is_empty());
+ DCHECK(strings->empty());
+ return RegExpClassSetExpression::Empty(zone(), is_negated);
+ }
+
+ return zone()->template New<RegExpClassSetExpression>(
+ RegExpClassSetExpression::OperationType::kUnion, is_negated,
+ may_contain_strings, operands);
+ }
+
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassIntersection
+ // Parses the "&& operand" repetitions of a class intersection up to and
+ // including the closing ']'. The caller has already parsed |first_operand|
+ // and stopped at the first "&&". The resulting expression may contain
+ // strings only if every operand may.
+ template <class CharT>
+ RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
+     const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+     ClassSetOperandType first_operand_type) {
+   DCHECK(unicode_sets());
+   DCHECK(current() == '&' && Next() == '&');
+   bool all_may_contain_strings =
+       MayContainStrings(first_operand_type, first_operand);
+   ZoneList<RegExpTree*>* operand_list =
+       zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+   operand_list->Add(first_operand, zone());
+
+   while (has_more() && current() != ']') {
+     const bool at_operator = current() == '&' && Next() == '&';
+     if (!at_operator) {
+       return ReportError(RegExpError::kInvalidClassSetOperation);
+     }
+     Advance(2);
+     // [lookahead ≠ &]
+     if (current() == '&') {
+       return ReportError(RegExpError::kInvalidCharacterInClass);
+     }
+
+     ClassSetOperandType rhs_type;
+     RegExpTree* rhs = ParseClassSetOperand(builder, &rhs_type CHECK_FAILED);
+     if (!MayContainStrings(rhs_type, rhs)) all_may_contain_strings = false;
+     operand_list->Add(rhs, zone());
+   }
+
+   if (!has_more()) {
+     return ReportError(RegExpError::kUnterminatedCharacterClass);
+   }
+   if (is_negated && all_may_contain_strings) {
+     return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+   }
+   DCHECK_EQ(current(), ']');
+   Advance();
+   return zone()->template New<RegExpClassSetExpression>(
+       RegExpClassSetExpression::OperationType::kIntersection, is_negated,
+       all_may_contain_strings, operand_list);
+ }
+
+ // TODO(v8:11935): Change permalink once proposal is in stage 4.
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSubtraction
+ // Parses the "-- operand" repetitions of a class subtraction up to and
+ // including the closing ']'. The caller has already parsed |first_operand|
+ // and stopped at the first "--". Whether the result may contain strings is
+ // decided by the first operand alone.
+ template <class CharT>
+ RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
+     const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+     ClassSetOperandType first_operand_type) {
+   DCHECK(unicode_sets());
+   DCHECK(current() == '-' && Next() == '-');
+   const bool lhs_may_contain_strings =
+       MayContainStrings(first_operand_type, first_operand);
+   if (is_negated && lhs_may_contain_strings) {
+     return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+   }
+
+   ZoneList<RegExpTree*>* operand_list =
+       zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+   operand_list->Add(first_operand, zone());
+   while (has_more() && current() != ']') {
+     const bool at_operator = current() == '-' && Next() == '-';
+     if (!at_operator) {
+       return ReportError(RegExpError::kInvalidClassSetOperation);
+     }
+     Advance(2);
+     ClassSetOperandType ignored_type;  // The operand kind is not needed here.
+     RegExpTree* rhs =
+         ParseClassSetOperand(builder, &ignored_type CHECK_FAILED);
+     operand_list->Add(rhs, zone());
+   }
+
+   if (!has_more()) {
+     return ReportError(RegExpError::kUnterminatedCharacterClass);
+   }
+   DCHECK_EQ(current(), ']');
+   Advance();
+   return zone()->template New<RegExpClassSetExpression>(
+       RegExpClassSetExpression::OperationType::kSubtraction, is_negated,
+       lhs_may_contain_strings, operand_list);
+ }
+
+ // https://tc39.es/ecma262/#prod-CharacterClass
+ // Parses a character class starting at '['. Without unicode sets mode the
+ // result is a RegExpClassRanges. With unicode sets mode the first operand is
+ // parsed here and the rest is delegated to ParseClassSubtraction(),
+ // ParseClassIntersection() or ParseClassUnion(), depending on what follows
+ // the first operand.
+ template <class CharT>
+ RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
+     const RegExpBuilder* builder) {
+   DCHECK_EQ(current(), '[');
+   Advance();
+   const bool is_negated = current() == '^';
+   if (is_negated) Advance();
+
+   ZoneList<CharacterRange>* ranges =
+       zone()->template New<ZoneList<CharacterRange>>(2, zone());
+
+   // Empty class: "[]" or "[^]".
+   if (current() == ']') {
+     Advance();
+     if (unicode_sets()) {
+       return RegExpClassSetExpression::Empty(zone(), is_negated);
+     }
+     RegExpClassRanges::ClassRangesFlags empty_class_flags;
+     if (is_negated) empty_class_flags = RegExpClassRanges::NEGATED;
+     return zone()->template New<RegExpClassRanges>(zone(), ranges,
+                                                    empty_class_flags);
+   }
+
+   if (!unicode_sets()) {
+     const bool add_unicode_case_equivalents =
+         IsUnicodeMode() && ignore_case();
+     ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED);
+     if (!has_more()) {
+       return ReportError(RegExpError::kUnterminatedCharacterClass);
+     }
+     DCHECK_EQ(current(), ']');
+     Advance();
+     RegExpClassRanges::ClassRangesFlags class_flags;
+     if (is_negated) class_flags = RegExpClassRanges::NEGATED;
+     return zone()->template New<RegExpClassRanges>(zone(), ranges,
+                                                    class_flags);
+   }
+
+   // Unicode sets mode: parse the first operand, then dispatch on the operator.
+   ClassSetOperandType operand_type;
+   CharacterClassStrings* strings =
+       zone()->template New<CharacterClassStrings>(zone());
+   RegExpTree* operand = ParseClassSetOperand(builder, &operand_type, ranges,
+                                              strings CHECK_FAILED);
+   const base::uc32 following = current();
+   const bool is_subtraction = following == '-' && Next() == '-';
+   const bool is_intersection = following == '&' && Next() == '&';
+   if (is_subtraction || is_intersection) {
+     // Both operations take a tree as first operand, so wrap the collected
+     // ranges/strings if the operand did not produce one.
+     if (operand == nullptr) {
+       operand = zone()->template New<RegExpClassSetOperand>(ranges, strings);
+     }
+     if (is_subtraction) {
+       return ParseClassSubtraction(builder, is_negated, operand,
+                                    operand_type);
+     }
+     return ParseClassIntersection(builder, is_negated, operand,
+                                   operand_type);
+   }
+   // A single '-' (ClassSetRange) is handled in ParseClassUnion().
+   return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
+                          strings);
+ }
+
+#undef CHECK_FAILED
+
+ // Parses the whole pattern. On success stores the tree and the collected
+ // pattern properties in |result| and returns true. On failure stores the
+ // error and its position in |result| and returns false.
+ template <class CharT>
+ bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
+   DCHECK_NOT_NULL(result);
+   RegExpTree* const parsed = ParsePattern();
+
+   if (!failed()) {
+     DCHECK_NOT_NULL(parsed);
+     DCHECK_EQ(error_, RegExpError::kNone);
+     if (v8_flags.trace_regexp_parser) {
+       StdoutStream os;
+       parsed->Print(os, zone());
+       os << "\n";
+     }
+
+     const int num_captures = captures_started();
+     result->tree = parsed;
+     result->simple = parsed->IsAtom() && simple() && num_captures == 0;
+     result->contains_anchor = contains_anchor();
+     result->capture_count = num_captures;
+     result->named_captures = GetNamedCaptures();
+     return true;
+   }
+
+   DCHECK_NULL(parsed);
+   DCHECK_NE(error_, RegExpError::kNone);
+   result->error = error_;
+   result->error_pos = error_pos_;
+   return false;
+ }
+
+ // Forwards to the text builder's FlushText().
+ void RegExpBuilder::FlushText() {
+   text_builder().FlushText();
+ }
+
+ // Clears the pending-empty marker and hands the code unit |c| to the text
+ // builder.
+ void RegExpBuilder::AddCharacter(base::uc16 c) {
+   pending_empty_ = false;
+   text_builder().AddCharacter(c);
+ }
+
+ // Clears the pending-empty marker and hands the code point |c| to the text
+ // builder.
+ void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
+   pending_empty_ = false;
+   text_builder().AddUnicodeCharacter(c);
+ }
+
+ // Clears the pending-empty marker and hands the escaped code point
+ // |character| to the text builder.
+ void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
+   pending_empty_ = false;
+   text_builder().AddEscapedUnicodeCharacter(character);
+ }
+
+ // Records an empty term: flushes any pending surrogate held by the text
+ // builder and sets the pending-empty marker, which AddQuantifierToAtom()
+ // consumes.
+ void RegExpBuilder::AddEmpty() {
+   text_builder().FlushPendingSurrogate();
+   pending_empty_ = true;
+ }
+
+ // Clears the pending-empty marker and hands the class ranges |cc| to the
+ // text builder.
+ void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) {
+   pending_empty_ = false;
+   text_builder().AddClassRanges(cc);
+ }
+
+ // Adds |term| as an atom. Empty terms are recorded via AddEmpty(), text
+ // elements go to the text builder, and everything else is appended to
+ // |terms_| after flushing pending text.
+ void RegExpBuilder::AddAtom(RegExpTree* term) {
+   if (term->IsEmpty()) {
+     AddEmpty();
+     return;
+   }
+   pending_empty_ = false;
+   if (term->IsTextElement()) {
+     text_builder().AddAtom(term);
+     return;
+   }
+   FlushText();
+   terms_.emplace_back(term);
+ }
+
+ // Adds the non-empty |term|. Text elements go to the text builder; anything
+ // else is appended to |terms_| after flushing pending text.
+ void RegExpBuilder::AddTerm(RegExpTree* term) {
+   DCHECK(!term->IsEmpty());
+   pending_empty_ = false;
+   if (term->IsTextElement()) {
+     text_builder().AddTerm(term);
+     return;
+   }
+   FlushText();
+   terms_.emplace_back(term);
+ }
+
+ // Flushes pending text, clears the pending-empty marker and appends the
+ // assertion to |terms_|.
+ void RegExpBuilder::AddAssertion(RegExpTree* assert) {
+   FlushText();
+   pending_empty_ = false;
+   terms_.emplace_back(assert);
+ }
+
+ // Closes the current alternative by flushing the terms collected so far.
+ void RegExpBuilder::NewAlternative() {
+   FlushTerms();
+ }
+
+ // Turns the collected terms into one alternative, appends it to
+ // |alternatives_| and clears |terms_|. Zero terms yield a RegExpEmpty, one
+ // term is used as is, and several terms are wrapped in a RegExpAlternative.
+ void RegExpBuilder::FlushTerms() {
+   FlushText();
+   RegExpTree* alternative = nullptr;
+   switch (terms_.size()) {
+     case 0:
+       alternative = zone()->New<RegExpEmpty>();
+       break;
+     case 1:
+       alternative = terms_.back();
+       break;
+     default:
+       alternative =
+           zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
+               base::VectorOf(terms_.begin(), terms_.size()), zone()));
+       break;
+   }
+   alternatives_.emplace_back(alternative);
+   terms_.clear();
+ }
+
+ // Flushes the current terms and returns the tree for everything collected:
+ // a RegExpEmpty for no alternatives, the alternative itself if there is
+ // exactly one, and a RegExpDisjunction over all of them otherwise.
+ RegExpTree* RegExpBuilder::ToRegExp() {
+   FlushTerms();
+   switch (alternatives_.size()) {
+     case 0:
+       return zone()->New<RegExpEmpty>();
+     case 1:
+       return alternatives_.back();
+     default:
+       return zone()->New<RegExpDisjunction>(
+           zone()->New<ZoneList<RegExpTree*>>(
+               base::VectorOf(alternatives_.begin(), alternatives_.size()),
+               zone()));
+   }
+ }
+
+ // Applies a quantifier to the most recently added atom. Returns false if
+ // that atom is a lookaround that may not be quantified, true otherwise.
+ bool RegExpBuilder::AddQuantifierToAtom(
+     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+   if (pending_empty_) {
+     // The last thing added was an empty term; just consume the marker.
+     pending_empty_ = false;
+     return true;
+   }
+
+   RegExpTree* atom = text_builder().PopLastAtom();
+   if (atom != nullptr) {
+     FlushText();
+   } else {
+     if (terms_.size() == 0) {
+       // Only call immediately after adding an atom or character!
+       UNREACHABLE();
+     }
+     atom = terms_.back();
+     terms_.pop_back();
+     if (atom->IsLookaround()) {
+       // With /u or /v, lookarounds are not quantifiable.
+       if (IsUnicodeMode()) return false;
+       // Lookbehinds are not quantifiable.
+       const bool is_lookbehind =
+           atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND;
+       if (is_lookbehind) return false;
+     }
+     if (atom->max_match() == 0) {
+       // Guaranteed to only match an empty string. With a minimum of zero the
+       // atom is dropped; otherwise it is put back without a quantifier.
+       if (min != 0) terms_.emplace_back(atom);
+       return true;
+     }
+   }
+
+   terms_.emplace_back(
+       zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));
+   return true;
+ }
+
+template class RegExpParserImpl<uint8_t>;
+template class RegExpParserImpl<base::uc16>;
+
+} // namespace
+
+ // static
+ // Parses the flat content of |input| with the parser instantiation matching
+ // its character width and returns the result of Parse().
+ bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
+                                              Handle<String> input,
+                                              RegExpFlags flags,
+                                              RegExpCompileData* result) {
+   DisallowGarbageCollection no_gc;
+   uintptr_t stack_limit = isolate->stack_guard()->real_climit();
+   String::FlatContent content = input->GetFlatContent(no_gc);
+
+   if (!content.IsOneByte()) {
+     base::Vector<const base::uc16> two_byte = content.ToUC16Vector();
+     return RegExpParserImpl<base::uc16>{two_byte.begin(), two_byte.length(),
+                                         flags,            stack_limit,
+                                         zone,             no_gc}
+         .Parse(result);
+   }
+
+   base::Vector<const uint8_t> one_byte = content.ToOneByteVector();
+   return RegExpParserImpl<uint8_t>{one_byte.begin(), one_byte.length(),
+                                    flags,            stack_limit,
+                                    zone,             no_gc}
+       .Parse(result);
+ }
+
+ // static
+ // Runs the parser over the |input_length| characters at |input| and returns
+ // the result of Parse(), which also fills |result|.
+ template <class CharT>
+ bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
+                                       const CharT* input, int input_length,
+                                       RegExpFlags flags,
+                                       RegExpCompileData* result,
+                                       const DisallowGarbageCollection& no_gc) {
+   RegExpParserImpl<CharT> parser{input, input_length, flags,
+                                  stack_limit, zone, no_gc};
+   return parser.Parse(result);
+ }
+
+template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
+ Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
+ const DisallowGarbageCollection&);
+template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
+ Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
+ const DisallowGarbageCollection&);
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-parser.h b/js/src/irregexp/imported/regexp-parser.h
new file mode 100644
index 0000000000..1e45d97532
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-parser.h
@@ -0,0 +1,34 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_PARSER_H_
+#define V8_REGEXP_REGEXP_PARSER_H_
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+class String;
+class Zone;
+
+struct RegExpCompileData;
+
+ // Static entry points into the regexp parser. Both functions return true on
+ // success; on failure they return false and the error and its position are
+ // stored in |result|.
+ class V8_EXPORT_PRIVATE RegExpParser : public AllStatic {
+ public:
+ // Parses the flat content of the heap string |input|, handling both
+ // one-byte and two-byte strings.
+ static bool ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
+ Handle<String> input, RegExpFlags flags,
+ RegExpCompileData* result);
+
+ // Parses the |input_length| characters starting at |input|. Explicitly
+ // instantiated for uint8_t and base::uc16.
+ template <class CharT>
+ static bool VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
+ const CharT* input, int input_length,
+ RegExpFlags flags, RegExpCompileData* result,
+ const DisallowGarbageCollection& no_gc);
+ };
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_PARSER_H_
diff --git a/js/src/irregexp/imported/regexp-stack.cc b/js/src/irregexp/imported/regexp-stack.cc
new file mode 100644
index 0000000000..ad0aedc67a
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-stack.cc
@@ -0,0 +1,96 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-stack.h"
+
+
+namespace v8 {
+namespace internal {
+
+ // Remembers the isolate's regexp stack together with its current
+ // position-independent stack pointer, which the destructor checks again.
+ RegExpStackScope::RegExpStackScope(Isolate* isolate)
+     : regexp_stack_(isolate->regexp_stack()),
+       old_sp_top_delta_(regexp_stack_->sp_top_delta()) {
+   DCHECK(regexp_stack_->IsValid());
+ }
+
+ // Verifies that the stack pointer is back where it was when the scope was
+ // entered, then resets the stack to the static area if it is empty.
+ RegExpStackScope::~RegExpStackScope() {
+   CHECK_EQ(old_sp_top_delta_, regexp_stack_->sp_top_delta());
+   regexp_stack_->ResetIfEmpty();
+ }
+
+ // The thread-local state starts out pointing at this instance's static stack.
+ RegExpStack::RegExpStack()
+     : thread_local_(this) {}
+
+ // Frees any owned stack memory and invalidates the thread-local state.
+ RegExpStack::~RegExpStack() {
+   thread_local_.FreeAndInvalidate();
+ }
+
+ // Copies the thread-local state into the buffer at |to|, resets this
+ // instance to a fresh static stack and returns the address just past the
+ // archived bytes.
+ char* RegExpStack::ArchiveStack(char* to) {
+   const bool uses_static_stack = !thread_local_.owns_memory_;
+   if (uses_static_stack) {
+     // Force dynamic stacks prior to archiving. Any growth will do. A dynamic
+     // stack is needed because stack archival & restoration rely on `memory_`
+     // pointing at a fixed-location backing store, whereas the static stack is
+     // tied to a RegExpStack instance.
+     EnsureCapacity(thread_local_.memory_size_ + 1);
+     DCHECK(thread_local_.owns_memory_);
+   }
+
+   MemCopy(reinterpret_cast<void*>(to), &thread_local_, kThreadLocalSize);
+   thread_local_ = ThreadLocal(this);
+   char* const archive_end = to + kThreadLocalSize;
+   return archive_end;
+ }
+
+
+ // Overwrites the thread-local state with the bytes archived at |from| and
+ // returns the address just past them.
+ char* RegExpStack::RestoreStack(char* from) {
+   void* const archived = reinterpret_cast<void*>(from);
+   MemCopy(&thread_local_, archived, kThreadLocalSize);
+   return from + kThreadLocalSize;
+ }
+
+ // Releases any owned dynamic memory and points this state at the static
+ // stack embedded in |regexp_stack|, with the stack pointer at the top.
+ void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) {
+   if (owns_memory_) DeleteArray(memory_);
+
+   byte* const static_base = regexp_stack->static_stack_;
+   memory_ = static_base;
+   memory_size_ = kStaticStackSize;
+   memory_top_ = static_base + kStaticStackSize;
+   stack_pointer_ = memory_top_;
+   limit_ = reinterpret_cast<Address>(static_base) +
+            kStackLimitSlack * kSystemPointerSize;
+   owns_memory_ = false;
+ }
+
+ // Releases any owned dynamic memory and puts this state into an invalid
+ // configuration (null pointers, zero size) so that stale memory cannot be
+ // used by accident afterwards.
+ void RegExpStack::ThreadLocal::FreeAndInvalidate() {
+   if (owns_memory_) DeleteArray(memory_);
+
+   // This stack may not be used after being freed. Just reset to invalid values
+   // to ensure we don't accidentally use old memory areas.
+   memory_ = nullptr;
+   memory_top_ = nullptr;
+   memory_size_ = 0;
+   stack_pointer_ = nullptr;
+   limit_ = kMemoryTop;
+   // Nothing is owned any more; keep the flag consistent with |memory_| so the
+   // ownership state never claims a buffer that has already been released.
+   owns_memory_ = false;
+ }
+
+ // Makes sure the stack area holds at least |size| bytes, growing into a
+ // dynamically allocated buffer if necessary. Existing contents are kept at
+ // the top of the new buffer and the stack pointer keeps its distance from
+ // the top. Returns the address of the top of the stack memory, or
+ // kNullAddress if |size| exceeds kMaximumStackSize.
+ Address RegExpStack::EnsureCapacity(size_t size) {
+   if (size > kMaximumStackSize) return kNullAddress;
+   if (thread_local_.memory_size_ < size) {
+     if (size < kMinimumDynamicStackSize) size = kMinimumDynamicStackSize;
+     // Capture the position-independent stack pointer while the old backing
+     // store is still alive, so no pointer into freed memory is read later.
+     const ptrdiff_t delta = sp_top_delta();
+     byte* new_memory = NewArray<byte>(size);
+     if (thread_local_.memory_size_ > 0) {
+       // Copy original memory into top of new memory.
+       MemCopy(new_memory + size - thread_local_.memory_size_,
+               thread_local_.memory_, thread_local_.memory_size_);
+       if (thread_local_.owns_memory_) DeleteArray(thread_local_.memory_);
+     }
+     thread_local_.memory_ = new_memory;
+     thread_local_.memory_top_ = new_memory + size;
+     thread_local_.memory_size_ = size;
+     thread_local_.stack_pointer_ = thread_local_.memory_top_ + delta;
+     thread_local_.limit_ = reinterpret_cast<Address>(new_memory) +
+                            kStackLimitSlack * kSystemPointerSize;
+     thread_local_.owns_memory_ = true;
+   }
+   return reinterpret_cast<Address>(thread_local_.memory_top_);
+ }
+
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/irregexp/imported/regexp-stack.h b/js/src/irregexp/imported/regexp-stack.h
new file mode 100644
index 0000000000..f03898bb00
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-stack.h
@@ -0,0 +1,159 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_STACK_H_
+#define V8_REGEXP_REGEXP_STACK_H_
+
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+class RegExpStack;
+
+ // Maintains a per-v8thread stack area that can be used by irregexp
+ // implementation for its backtracking stack.
+ class V8_NODISCARD RegExpStackScope final {
+ public:
+ // Create and delete an instance to control the life-time of a growing stack.
+
+ // Initializes the stack memory area if necessary.
+ explicit RegExpStackScope(Isolate* isolate);
+ ~RegExpStackScope(); // Releases the stack if it has grown.
+ RegExpStackScope(const RegExpStackScope&) = delete;
+ RegExpStackScope& operator=(const RegExpStackScope&) = delete;
+
+ // The regexp stack this scope was created for.
+ RegExpStack* stack() const { return regexp_stack_; }
+
+ private:
+ RegExpStack* const regexp_stack_;
+ // sp_top_delta() of the stack at construction time; the destructor checks
+ // that the stack pointer is back at this position.
+ const ptrdiff_t old_sp_top_delta_;
+ };
+
+ // Backing storage for the irregexp backtracking stack. A small statically
+ // sized area embedded in the object is used until EnsureCapacity() switches
+ // to a dynamically allocated buffer; the current state is kept in
+ // |thread_local_|.
+ class RegExpStack final {
+ public:
+ RegExpStack();
+ ~RegExpStack();
+ RegExpStack(const RegExpStack&) = delete;
+ RegExpStack& operator=(const RegExpStack&) = delete;
+
+ // Number of allocated locations on the stack below the limit. No sequence of
+ // pushes must be longer than this without doing a stack-limit check.
+ static constexpr int kStackLimitSlack = 32;
+
+ Address memory_top() const {
+ DCHECK_NE(0, thread_local_.memory_size_);
+ DCHECK_EQ(thread_local_.memory_top_,
+ thread_local_.memory_ + thread_local_.memory_size_);
+ return reinterpret_cast<Address>(thread_local_.memory_top_);
+ }
+
+ Address stack_pointer() const {
+ return reinterpret_cast<Address>(thread_local_.stack_pointer_);
+ }
+
+ size_t memory_size() const { return thread_local_.memory_size_; }
+
+ // If the stack pointer gets below the limit, we should react and
+ // either grow the stack or report an out-of-stack exception.
+ // There is only a limited number of locations below the stack limit,
+ // so users of the stack should check the stack limit during any
+ // sequence of pushes longer than this.
+ Address* limit_address_address() { return &thread_local_.limit_; }
+
+ // Ensures that there is a memory area with at least the specified size.
+ // If passing zero, the default/minimum size buffer is allocated.
+ Address EnsureCapacity(size_t size);
+
+ // Thread local archiving.
+ static constexpr int ArchiveSpacePerThread() {
+ return static_cast<int>(kThreadLocalSize);
+ }
+ char* ArchiveStack(char* to);
+ char* RestoreStack(char* from);
+ void FreeThreadResources() { thread_local_.ResetToStaticStack(this); }
+
+ // Maximal size of allocated stack area.
+ static constexpr size_t kMaximumStackSize = 64 * MB;
+
+ private:
+ // Artificial limit used when the thread-local state has been destroyed.
+ static const Address kMemoryTop =
+ static_cast<Address>(static_cast<uintptr_t>(-1));
+
+ // Minimal size of dynamically-allocated stack area.
+ static constexpr size_t kMinimumDynamicStackSize = 1 * KB;
+
+ // In addition to dynamically-allocated, variable-sized stacks, we also have
+ // a statically allocated and sized area that is used whenever no dynamic
+ // stack is allocated. This guarantees that a stack is always available and
+ // we can skip availability-checks later on.
+ // It's double the slack size to ensure that we have a bit of breathing room
+ // before NativeRegExpMacroAssembler::GrowStack must be called.
+ static constexpr size_t kStaticStackSize =
+ 2 * kStackLimitSlack * kSystemPointerSize;
+ byte static_stack_[kStaticStackSize] = {0};
+
+ static_assert(kStaticStackSize <= kMaximumStackSize);
+
+ // Structure holding the allocated memory, size and limit. Thread switching
+ // archives and restores this struct.
+ struct ThreadLocal {
+ explicit ThreadLocal(RegExpStack* regexp_stack) {
+ ResetToStaticStack(regexp_stack);
+ }
+
+ // If memory_size_ > 0 then
+ // - memory_, memory_top_, stack_pointer_ must be non-nullptr
+ // - memory_top_ = memory_ + memory_size_
+ // - memory_ <= stack_pointer_ <= memory_top_
+ byte* memory_ = nullptr;
+ byte* memory_top_ = nullptr;
+ size_t memory_size_ = 0;
+ byte* stack_pointer_ = nullptr;
+ Address limit_ = kNullAddress;
+ bool owns_memory_ = false; // Whether memory_ is owned and must be freed.
+
+ void ResetToStaticStack(RegExpStack* regexp_stack);
+ // The stack counts as empty when the stack pointer is at the top.
+ void ResetToStaticStackIfEmpty(RegExpStack* regexp_stack) {
+ if (stack_pointer_ == memory_top_) ResetToStaticStack(regexp_stack);
+ }
+ void FreeAndInvalidate();
+ };
+ static constexpr size_t kThreadLocalSize = sizeof(ThreadLocal);
+
+ // Address of the |memory_top_| field itself, not of the stack memory.
+ Address memory_top_address_address() {
+ return reinterpret_cast<Address>(&thread_local_.memory_top_);
+ }
+
+ // Address of the |stack_pointer_| field itself, not of the stack memory.
+ Address stack_pointer_address() {
+ return reinterpret_cast<Address>(&thread_local_.stack_pointer_);
+ }
+
+ // A position-independent representation of the stack pointer.
+ ptrdiff_t sp_top_delta() const {
+ ptrdiff_t result =
+ reinterpret_cast<intptr_t>(thread_local_.stack_pointer_) -
+ reinterpret_cast<intptr_t>(thread_local_.memory_top_);
+ DCHECK_LE(result, 0);
+ return result;
+ }
+
+ // Resets the buffer if it has grown beyond the default/minimum size and is
+ // empty.
+ void ResetIfEmpty() { thread_local_.ResetToStaticStackIfEmpty(this); }
+
+ // Whether the ThreadLocal storage has been invalidated.
+ bool IsValid() const { return thread_local_.memory_ != nullptr; }
+
+ ThreadLocal thread_local_;
+
+ friend class ExternalReference;
+ friend class RegExpStackScope;
+ };
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_STACK_H_
diff --git a/js/src/irregexp/imported/regexp.h b/js/src/irregexp/imported/regexp.h
new file mode 100644
index 0000000000..50269a4b71
--- /dev/null
+++ b/js/src/irregexp/imported/regexp.h
@@ -0,0 +1,236 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_H_
+#define V8_REGEXP_REGEXP_H_
+
+#include "irregexp/imported/regexp-error.h"
+#include "irregexp/RegExpShim.h"
+
+namespace v8 {
+namespace internal {
+
+class JSRegExp;
+class RegExpCapture;
+class RegExpMatchInfo;
+class RegExpNode;
+class RegExpTree;
+
+enum class RegExpCompilationTarget : int { kBytecode, kNative };
+
+// TODO(jgruber): Do not expose in regexp.h.
+// TODO(jgruber): Consider splitting between ParseData and CompileData.
+struct RegExpCompileData {
+ // The parsed AST as produced by the RegExpParser.
+ RegExpTree* tree = nullptr;
+
+ // The compiled Node graph as produced by RegExpTree::ToNode methods.
+ RegExpNode* node = nullptr;
+
+ // Either the generated code as produced by the compiler or a trampoline
+ // to the interpreter.
+ Handle<Object> code;
+
+ // True, iff the pattern is a 'simple' atom with zero captures. In other
+ // words, the pattern consists of a string with no metacharacters and special
+ // regexp features, and can be implemented as a standard string search.
+ bool simple = true;
+
+ // True, iff the pattern is anchored at the start of the string with '^'.
+ bool contains_anchor = false;
+
+ // Only set if the pattern contains named captures.
+ // Note: the lifetime equals that of the parse/compile zone.
+ ZoneVector<RegExpCapture*>* named_captures = nullptr;
+
+ // The error message. Only used if an error occurred during parsing or
+ // compilation.
+ RegExpError error = RegExpError::kNone;
+
+ // The position at which the error was detected. Only used if an
+ // error occurred.
+ int error_pos = 0;
+
+ // The number of capture groups, without the global capture \0.
+ int capture_count = 0;
+
+ // The number of registers used by the generated code.
+ int register_count = 0;
+
+ // The compilation target (bytecode or native code).
+ RegExpCompilationTarget compilation_target;  // NOTE(review): no default initializer; presumably always assigned before it is read - confirm.
+};
+
+class RegExp final : public AllStatic {
+ public:
+ // Whether the irregexp engine generates interpreter bytecode.
+ static bool CanGenerateBytecode();
+
+ // Verify that the given flags combination is valid.
+ V8_EXPORT_PRIVATE static bool VerifyFlags(RegExpFlags flags);
+
+ // Verify the given pattern, i.e. check that parsing succeeds. If
+ // verification fails, `regexp_error_out` is set.
+ template <class CharT>
+ static bool VerifySyntax(Zone* zone, uintptr_t stack_limit,
+ const CharT* input, int input_length,
+ RegExpFlags flags, RegExpError* regexp_error_out,
+ const DisallowGarbageCollection& no_gc);
+
+ // Parses the RegExp pattern and prepares the JSRegExp object with
+ // generic data and choice of implementation - as well as what
+ // the implementation wants to store in the data field.
+ // Returns a MaybeHandle<Object>; presumably empty if compilation fails (TODO confirm).
+ V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
+ Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
+ RegExpFlags flags, uint32_t backtrack_limit);
+
+ // Ensures that a regexp is fully compiled and ready to be executed on a
+ // subject string. Returns true on success. Returns false on failure, and
+ // then an exception will be pending.
+ V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate,
+ Handle<JSRegExp> re,
+ Handle<String> subject);
+
+ enum CallOrigin : int {
+ kFromRuntime = 0,
+ kFromJs = 1,
+ };
+
+ enum class ExecQuirks {
+ kNone,
+ // Used to work around an issue in the RegExpPrototypeSplit fast path,
+ // which diverges from the spec by not creating a sticky copy of the RegExp
+ // instance and calling `exec` in a loop. If called in this context, we
+ // must not update the last_match_info on a successful match at the subject
+ // string end. See crbug.com/1075514 for more information.
+ kTreatMatchAtEndAsFailure,
+ };
+
+ // See ECMA-262 section 15.10.6.2.
+ // This function calls the garbage collector if necessary.
+ V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
+ Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
+ int index, Handle<RegExpMatchInfo> last_match_info,
+ ExecQuirks exec_quirks = ExecQuirks::kNone);
+
+ V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object>
+ ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp,
+ Handle<String> subject, int index,
+ Handle<RegExpMatchInfo> last_match_info,
+ ExecQuirks exec_quirks = ExecQuirks::kNone);
+
+ // Integral return values used throughout regexp code layers.
+ static constexpr int kInternalRegExpFailure = 0;
+ static constexpr int kInternalRegExpSuccess = 1;
+ static constexpr int kInternalRegExpException = -1;
+ static constexpr int kInternalRegExpRetry = -2;
+ static constexpr int kInternalRegExpFallbackToExperimental = -3;
+ static constexpr int kInternalRegExpSmallestResult = -3;  // Equals the lowest code listed above; keep in sync.
+
+ enum IrregexpResult : int32_t {  // Enum mirror of the integer codes above.
+ RE_FAILURE = kInternalRegExpFailure,
+ RE_SUCCESS = kInternalRegExpSuccess,
+ RE_EXCEPTION = kInternalRegExpException,
+ RE_RETRY = kInternalRegExpRetry,
+ RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental,
+ };
+
+ // Set last match info. If match is nullptr, then setting captures is
+ // omitted.
+ static Handle<RegExpMatchInfo> SetLastMatchInfo(
+ Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
+ Handle<String> subject, int capture_count, int32_t* match);
+
+ V8_EXPORT_PRIVATE static bool CompileForTesting(
+ Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags,
+ Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte);
+
+ V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
+ RegExpNode* node);
+
+ static const int kRegExpTooLargeToOptimize = 20 * KB;
+
+ V8_WARN_UNUSED_RESULT
+ static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
+ Handle<JSRegExp> re,
+ RegExpFlags flags,
+ Handle<String> pattern,
+ RegExpError error);
+ static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
+ RegExpError error_text);  // NOTE(review): named error_text, but the type is the RegExpError enum.
+
+ static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp);
+
+ static Handle<FixedArray> CreateCaptureNameMap(
+ Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures);
+};
+
+// Uses a special global mode of irregexp-generated code to perform a global
+// search and return multiple results at once. As such, this is essentially an
+// iterator over multiple results (retrieved batch-wise in advance).
+class RegExpGlobalCache final {
+ public:
+ RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject,
+ Isolate* isolate);
+
+ ~RegExpGlobalCache();
+
+ // Fetch the next entry in the cache for global regexp match results.
+ // This does not set the last match info. Upon failure, nullptr is returned.
+ // NOTE(review): this comment used to point at Result() for the cause; only HasException() is declared here.
+ // The previous result is still available in memory when a failure happens.
+ int32_t* FetchNext();
+
+ int32_t* LastSuccessfulMatch();
+
+ bool HasException() { return num_matches_ < 0; }  // A negative num_matches_ encodes the exception state.
+
+ private:
+ int AdvanceZeroLength(int last_index);
+
+ int num_matches_;
+ int max_matches_;
+ int current_match_index_;
+ int registers_per_match_;
+ // Pointer to the last set of captures.
+ int32_t* register_array_;
+ int register_array_size_;
+ Handle<JSRegExp> regexp_;
+ Handle<String> subject_;
+ Isolate* isolate_;
+};
+
+// Caches results for specific regexp queries on the isolate. At the time of
+// writing, this is used during global calls to RegExp.prototype.exec and
+// @@split.
+class RegExpResultsCache final : public AllStatic {
+ public:
+ enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };  // Selects which cache is used.
+
+ // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
+ // On success, the returned result is guaranteed to be a COW-array.
+ static Object Lookup(Heap* heap, String key_string, Object key_pattern,
+ FixedArray* last_match_out, ResultsCacheType type);
+ // Attempt to add value_array to the cache specified by type. On success,
+ // value_array is turned into a COW-array.
+ static void Enter(Isolate* isolate, Handle<String> key_string,
+ Handle<Object> key_pattern, Handle<FixedArray> value_array,
+ Handle<FixedArray> last_match_cache, ResultsCacheType type);
+ static void Clear(FixedArray cache);
+
+ static constexpr int kRegExpResultsCacheSize = 0x100;
+
+ private:
+ static constexpr int kStringOffset = 0;  // Offsets 0..3 presumably index the slots of one cache entry - confirm in the .cc.
+ static constexpr int kPatternOffset = 1;
+ static constexpr int kArrayOffset = 2;
+ static constexpr int kLastMatchOffset = 3;
+ static constexpr int kArrayEntriesPerCacheEntry = 4;  // One more than the highest offset above.
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_H_
diff --git a/js/src/irregexp/imported/special-case.cc b/js/src/irregexp/imported/special-case.cc
new file mode 100644
index 0000000000..f5a9928b3a
--- /dev/null
+++ b/js/src/irregexp/imported/special-case.cc
@@ -0,0 +1,111 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that
+// can be found in the LICENSE file.
+
+// Automatically generated by regexp/gen-regexp-special-case.cc
+
+// The following functions are used to build UnicodeSets
+// for special cases where the case-folding algorithm used by
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match
+// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime
+// Semantics: Canonicalize) step 3.
+
+#ifdef V8_INTL_SUPPORT
+#include "irregexp/imported/special-case.h"
+
+#include "unicode/uniset.h"
+namespace v8 {
+namespace internal {
+
+icu::UnicodeSet BuildIgnoreSet() {  // Builds the frozen set that IgnoreSetData stores.
+ icu::UnicodeSet set;
+ set.add(0xdf);
+ set.add(0x17f);
+ set.add(0x390);
+ set.add(0x3b0);
+ set.add(0x3f4);
+ set.add(0x1e9e);
+ set.add(0x1f80, 0x1faf);  // Two-argument form adds the whole range.
+ set.add(0x1fb3);
+ set.add(0x1fbc);
+ set.add(0x1fc3);
+ set.add(0x1fcc);
+ set.add(0x1fd3);
+ set.add(0x1fe3);
+ set.add(0x1ff3);
+ set.add(0x1ffc);
+ set.add(0x2126);
+ set.add(0x212a, 0x212b);
+ set.add(0xfb05, 0xfb06);
+ set.freeze();  // Freeze the finished set before returning it.
+ return set;
+}
+
+struct IgnoreSetData {  // Default-constructible holder for the const set.
+ IgnoreSetData() : set(BuildIgnoreSet()) {}  // Fills the set at construction time.
+ const icu::UnicodeSet set;
+};
+
+// static. Returns the set held by a function-local LazyInstance<IgnoreSetData>.
+const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() {
+ static base::LazyInstance<IgnoreSetData>::type set =
+ LAZY_INSTANCE_INITIALIZER;
+ return set.Pointer()->set;  // Presumably constructs the holder on first use - see base::LazyInstance.
+}
+
+icu::UnicodeSet BuildSpecialAddSet() {  // Builds the frozen set that SpecialAddSetData stores.
+ icu::UnicodeSet set;
+ set.add(0x4b);  // 'K'
+ set.add(0x53);  // 'S'
+ set.add(0x6b);  // 'k'
+ set.add(0x73);  // 's'
+ set.add(0xc5);
+ set.add(0xe5);
+ set.add(0x398);
+ set.add(0x3a9);
+ set.add(0x3b8);
+ set.add(0x3c9);
+ set.add(0x3d1);
+ set.freeze();  // Freeze the finished set before returning it.
+ return set;
+}
+
+struct SpecialAddSetData {  // Default-constructible holder for the const set.
+ SpecialAddSetData() : set(BuildSpecialAddSet()) {}  // Fills the set at construction time.
+ const icu::UnicodeSet set;
+};
+
+// static. Returns the set held by a function-local LazyInstance<SpecialAddSetData>.
+const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() {
+ static base::LazyInstance<SpecialAddSetData>::type set =
+ LAZY_INSTANCE_INITIALIZER;
+ return set.Pointer()->set;  // Presumably constructs the holder on first use - see base::LazyInstance.
+}
+
+icu::UnicodeSet BuildUnicodeNonSimpleCloseOverSet() {  // Builds the frozen set that UnicodeNonSimpleCloseOverSetData stores.
+ icu::UnicodeSet set;
+ set.add(0x390);
+ set.add(0x3b0);
+ set.add(0x1fd3);
+ set.add(0x1fe3);
+ set.add(0xfb05, 0xfb06);  // Two-argument form adds the whole range.
+ set.freeze();  // Freeze the finished set before returning it.
+ return set;
+}
+
+struct UnicodeNonSimpleCloseOverSetData {  // Default-constructible holder for the const set.
+ UnicodeNonSimpleCloseOverSetData() : set(BuildUnicodeNonSimpleCloseOverSet()) {}  // Fills the set at construction time.
+ const icu::UnicodeSet set;
+};
+
+// static. Returns the set held by a function-local LazyInstance<UnicodeNonSimpleCloseOverSetData>.
+const icu::UnicodeSet& RegExpCaseFolding::UnicodeNonSimpleCloseOverSet() {
+ static base::LazyInstance<UnicodeNonSimpleCloseOverSetData>::type set =
+ LAZY_INSTANCE_INITIALIZER;
+ return set.Pointer()->set;  // Presumably constructs the holder on first use - see base::LazyInstance.
+}
+
+
+} // namespace internal
+} // namespace v8
+#endif // V8_INTL_SUPPORT
diff --git a/js/src/irregexp/imported/special-case.h b/js/src/irregexp/imported/special-case.h
new file mode 100644
index 0000000000..ea511af5a4
--- /dev/null
+++ b/js/src/irregexp/imported/special-case.h
@@ -0,0 +1,127 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_SPECIAL_CASE_H_
+#define V8_REGEXP_SPECIAL_CASE_H_
+
+#ifdef V8_INTL_SUPPORT
+#include "irregexp/RegExpShim.h"
+
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+
+namespace v8 {
+namespace internal {
+
+// Sets of Unicode characters that need special handling under "i" mode
+
+// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
+// defines slightly different case-folding rules than Unicode. An
+// input character should match a pattern character if the result of
+// the Canonicalize algorithm is the same for both characters.
+//
+// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
+// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
+// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
+// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
+// the precise definition.
+//
+// While compiling such regular expressions, we need to compute the
+// set of characters that should match a given input character. (See
+// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
+// For almost all characters, this can be efficiently computed using
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
+// the remaining special cases.
+//
+// For a character c, the rules are as follows:
+//
+// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
+// containing c will produce the set of characters that should
+// match /c/i (or /[c]/i), and only those characters.
+//
+// 2. If c is in IgnoreSet, then the only character it should match is
+// itself. However, closeOver will add additional incorrect
+// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
+// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
+// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
+// itself, and should not match 'ẞ'. In these cases, we can skip
+// the closeOver entirely, because it will never add an equivalent
+// character.
+//
+// 3. If c is in SpecialAddSet, then it should match at least one
+// character other than itself. However, closeOver will add at
+// least one additional incorrect match. For example, consider the
+// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
+// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
+// SIGN should not match either of the other two characters. As a
+// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
+// IgnoreSet). To find the correct matches for characters in
+// SpecialAddSet, we closeOver the original character, but filter
+// out the results that do not have the same canonical value.
+//
+// The contents of these sets are calculated at build time by
+// src/regexp/gen-regexp-special-case.cc, which generates
+// gen/src/regexp/special-case.cc. This is done by iterating over the
+// result of closeOver for each BMP character, and finding sets for
+// which at least one character has a different canonical value than
+// another character. Characters that match no other characters in
+// their equivalence class are added to IgnoreSet. Characters that
+// match at least one other character are added to SpecialAddSet.
+//
+// For unicode ignoreCase ("iu" and "iv"),
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in
+// the same equivalence class. This includes characters that are in the same
+// equivalence class using full case folding. According to the spec, only
+// simple case folding shall be considered. We therefore create
+// UnicodeNonSimpleCloseOverSet containing all characters for which
+// UnicodeSet::closeOver adds characters that are not simple case folds. This
+// set should be used similar to IgnoreSet described above.
+
+class RegExpCaseFolding final : public AllStatic {
+ public:
+ static const icu::UnicodeSet& IgnoreSet();
+ static const icu::UnicodeSet& SpecialAddSet();
+ static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet();
+
+ // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
+ // Canonicalize) step 3, which is used to determine whether
+ // characters match when ignoreCase is true and unicode is false.
+ static UChar32 Canonicalize(UChar32 ch) {
+ // a. Assert: ch is a UTF-16 code unit.
+ CHECK_LE(ch, 0xffff);  // Only the upper bound is checked here.
+
+ // b. Let s be the String value consisting of the single code unit ch.
+ icu::UnicodeString s(ch);
+
+ // c. Let u be the same result produced as if by performing the algorithm
+ // for String.prototype.toUpperCase using s as the this value.
+ // d. Assert: Type(u) is String.
+ icu::UnicodeString& u = s.toUpper();  // NOTE(review): locale-less overload; ICU documents it as using the default locale - confirm that is locale-insensitive here.
+
+ // e. If u does not consist of a single code unit, return ch.
+ if (u.length() != 1) {
+ return ch;
+ }
+
+ // f. Let cu be u's single code unit element.
+ UChar32 cu = u.char32At(0);
+
+ // g. If the value of ch >= 128 and the value of cu < 128, return ch.
+ if (ch >= 128 && cu < 128) {
+ return ch;
+ }
+
+ // h. Return cu.
+ return cu;
+ }
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_INTL_SUPPORT
+
+#endif // V8_REGEXP_SPECIAL_CASE_H_