diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /js/src/irregexp | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
49 files changed, 26159 insertions, 0 deletions
diff --git a/js/src/irregexp/LICENSE.v8 b/js/src/irregexp/LICENSE.v8 new file mode 100644 index 0000000000..933718a9ef --- /dev/null +++ b/js/src/irregexp/LICENSE.v8 @@ -0,0 +1,26 @@ +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/js/src/irregexp/RegExpAPI.cpp b/js/src/irregexp/RegExpAPI.cpp new file mode 100644 index 0000000000..a008d454f1 --- /dev/null +++ b/js/src/irregexp/RegExpAPI.cpp @@ -0,0 +1,916 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/RegExpAPI.h" + +#include "mozilla/ArrayUtils.h" +#include "mozilla/Casting.h" + +#include "frontend/FrontendContext.h" // AutoReportFrontendContext +#include "frontend/TokenStream.h" +#include "gc/GC.h" +#include "gc/Zone.h" +#include "irregexp/imported/regexp-ast.h" +#include "irregexp/imported/regexp-bytecode-generator.h" +#include "irregexp/imported/regexp-compiler.h" +#include "irregexp/imported/regexp-interpreter.h" +#include "irregexp/imported/regexp-macro-assembler-arch.h" +#include "irregexp/imported/regexp-macro-assembler-tracer.h" +#include "irregexp/imported/regexp-parser.h" +#include "irregexp/imported/regexp-stack.h" +#include "irregexp/imported/regexp.h" +#include "irregexp/RegExpNativeMacroAssembler.h" +#include "irregexp/RegExpShim.h" +#include "jit/JitCommon.h" +#include "js/friend/ErrorMessages.h" // JSMSG_* +#include "js/friend/StackLimits.h" // js::ReportOverRecursed +#include "util/StringBuffer.h" +#include "vm/MatchPairs.h" +#include "vm/PlainObject.h" +#include "vm/RegExpShared.h" + +namespace js { +namespace irregexp { + +using mozilla::AssertedCast; +using mozilla::Maybe; +using mozilla::Nothing; +using mozilla::PointerRangeSize; +using mozilla::Some; + +using frontend::DummyTokenStream; +using frontend::TokenStreamAnyChars; + +using 
v8::internal::DisallowGarbageCollection; +using v8::internal::HandleScope; +using v8::internal::InputOutputData; +using v8::internal::IrregexpInterpreter; +using v8::internal::NativeRegExpMacroAssembler; +using v8::internal::RegExpBytecodeGenerator; +using v8::internal::RegExpCapture; +using v8::internal::RegExpCompileData; +using v8::internal::RegExpCompiler; +using v8::internal::RegExpError; +using v8::internal::RegExpMacroAssembler; +using v8::internal::RegExpMacroAssemblerTracer; +using v8::internal::RegExpNode; +using v8::internal::RegExpParser; +using v8::internal::SMRegExpMacroAssembler; +using v8::internal::Zone; +using v8::internal::ZoneVector; + +using V8HandleString = v8::internal::Handle<v8::internal::String>; +using V8HandleRegExp = v8::internal::Handle<v8::internal::JSRegExp>; + +using namespace v8::internal::regexp_compiler_constants; + +static uint32_t ErrorNumber(RegExpError err) { + switch (err) { + case RegExpError::kNone: + return JSMSG_NOT_AN_ERROR; + case RegExpError::kStackOverflow: + return JSMSG_OVER_RECURSED; + case RegExpError::kAnalysisStackOverflow: + return JSMSG_OVER_RECURSED; + case RegExpError::kTooLarge: + return JSMSG_TOO_MANY_PARENS; + case RegExpError::kUnterminatedGroup: + return JSMSG_MISSING_PAREN; + case RegExpError::kUnmatchedParen: + return JSMSG_UNMATCHED_RIGHT_PAREN; + case RegExpError::kEscapeAtEndOfPattern: + return JSMSG_ESCAPE_AT_END_OF_REGEXP; + case RegExpError::kInvalidPropertyName: + return JSMSG_INVALID_PROPERTY_NAME; + case RegExpError::kInvalidEscape: + return JSMSG_INVALID_IDENTITY_ESCAPE; + case RegExpError::kInvalidDecimalEscape: + return JSMSG_INVALID_DECIMAL_ESCAPE; + case RegExpError::kInvalidUnicodeEscape: + return JSMSG_INVALID_UNICODE_ESCAPE; + case RegExpError::kNothingToRepeat: + return JSMSG_NOTHING_TO_REPEAT; + case RegExpError::kLoneQuantifierBrackets: + // Note: V8 reports the same error for both ']' and '}'. 
+ return JSMSG_RAW_BRACKET_IN_REGEXP; + case RegExpError::kRangeOutOfOrder: + return JSMSG_NUMBERS_OUT_OF_ORDER; + case RegExpError::kIncompleteQuantifier: + return JSMSG_INCOMPLETE_QUANTIFIER; + case RegExpError::kInvalidQuantifier: + return JSMSG_INVALID_QUANTIFIER; + case RegExpError::kInvalidGroup: + return JSMSG_INVALID_GROUP; + case RegExpError::kMultipleFlagDashes: + case RegExpError::kRepeatedFlag: + case RegExpError::kInvalidFlagGroup: + // V8 contains experimental support for turning regexp flags on + // and off in the middle of a regular expression. Unless it + // becomes standardized, SM does not support this feature. + MOZ_CRASH("Mode modifiers not supported"); + case RegExpError::kNotLinear: + // V8 has an experimental non-backtracking engine. We do not + // support it yet. + MOZ_CRASH("Non-backtracking execution not supported"); + case RegExpError::kTooManyCaptures: + return JSMSG_TOO_MANY_PARENS; + case RegExpError::kInvalidCaptureGroupName: + return JSMSG_INVALID_CAPTURE_NAME; + case RegExpError::kDuplicateCaptureGroupName: + return JSMSG_DUPLICATE_CAPTURE_NAME; + case RegExpError::kInvalidNamedReference: + return JSMSG_INVALID_NAMED_REF; + case RegExpError::kInvalidNamedCaptureReference: + return JSMSG_INVALID_NAMED_CAPTURE_REF; + case RegExpError::kInvalidClassEscape: + return JSMSG_RANGE_WITH_CLASS_ESCAPE; + case RegExpError::kInvalidClassPropertyName: + return JSMSG_INVALID_CLASS_PROPERTY_NAME; + case RegExpError::kInvalidCharacterClass: + return JSMSG_RANGE_WITH_CLASS_ESCAPE; + case RegExpError::kUnterminatedCharacterClass: + return JSMSG_UNTERM_CLASS; + case RegExpError::kOutOfOrderCharacterClass: + return JSMSG_BAD_CLASS_RANGE; + + case RegExpError::kInvalidClassSetOperation: + case RegExpError::kInvalidCharacterInClass: + case RegExpError::kNegatedCharacterClassWithStrings: + // TODO: implement support for /v flag (bug 1713657) + MOZ_CRASH("Unicode sets not supported"); + + case RegExpError::NumErrors: + MOZ_CRASH("Unreachable"); + } + 
MOZ_CRASH("Unreachable"); +} + +Isolate* CreateIsolate(JSContext* cx) { + auto isolate = MakeUnique<Isolate>(cx); + if (!isolate || !isolate->init()) { + return nullptr; + } + return isolate.release(); +} + +void TraceIsolate(JSTracer* trc, Isolate* isolate) { isolate->trace(trc); } + +void DestroyIsolate(Isolate* isolate) { + MOZ_ASSERT(isolate->liveHandles() == 0); + MOZ_ASSERT(isolate->livePseudoHandles() == 0); + js_delete(isolate); +} + +size_t IsolateSizeOfIncludingThis(Isolate* isolate, + mozilla::MallocSizeOf mallocSizeOf) { + return isolate->sizeOfIncludingThis(mallocSizeOf); +} + +static size_t ComputeColumn(const Latin1Char* begin, const Latin1Char* end) { + return PointerRangeSize(begin, end); +} + +static size_t ComputeColumn(const char16_t* begin, const char16_t* end) { + return unicode::CountCodePoints(begin, end); +} + +// This function is varargs purely so it can call ReportCompileErrorLatin1. +// We never call it with additional arguments. +template <typename CharT> +static void ReportSyntaxError(TokenStreamAnyChars& ts, + mozilla::Maybe<uint32_t> line, + mozilla::Maybe<uint32_t> column, + RegExpCompileData& result, CharT* start, + size_t length, ...) { + MOZ_ASSERT(line.isSome() == column.isSome()); + + Maybe<gc::AutoSuppressGC> suppressGC; + if (JSContext* maybeCx = ts.context()->maybeCurrentJSContext()) { + suppressGC.emplace(maybeCx); + } + uint32_t errorNumber = ErrorNumber(result.error); + + if (errorNumber == JSMSG_OVER_RECURSED) { + ReportOverRecursed(ts.context()); + return; + } + + uint32_t offset = std::max(result.error_pos, 0); + MOZ_ASSERT(offset <= length); + + ErrorMetadata err; + + // Ordinarily this indicates whether line-of-context information can be + // added, but we entirely ignore that here because we create a + // a line of context based on the expression source. 
+ uint32_t location = ts.currentToken().pos.begin; + if (ts.fillExceptingContext(&err, location)) { + uint32_t columnNumber = + AssertedCast<uint32_t>(ComputeColumn(start, start + offset)); + if (line.isSome()) { + // If this pattern is being checked by the frontend Parser instead + // of other API entry points like |new RegExp|, then the parser will + // have provided both a line and column pointing at the *beginning* + // of the RegExp literal inside the source text. + // We adjust the columnNumber to point to the actual syntax error + // inside the literal. + err.lineNumber = *line; + err.columnNumber = *column + columnNumber; + } else { + // Line breaks are not significant in pattern text in the same way as + // in source text, so act as though pattern text is a single line, then + // compute a column based on "code point" count (treating a lone + // surrogate as a "code point" in UTF-16). Gak. + err.lineNumber = 1; + err.columnNumber = columnNumber; + } + } + + // For most error reporting, the line of context derives from the token + // stream. So when location information doesn't come from the token + // stream, we can't give a line of context. But here the "line of context" + // can be (and is) derived from the pattern text, so we can provide it no + // matter if the location is derived from the caller. + + const CharT* windowStart = + (offset > ErrorMetadata::lineOfContextRadius) + ? start + (offset - ErrorMetadata::lineOfContextRadius) + : start; + + const CharT* windowEnd = + (length - offset > ErrorMetadata::lineOfContextRadius) + ? start + offset + ErrorMetadata::lineOfContextRadius + : start + length; + + size_t windowLength = PointerRangeSize(windowStart, windowEnd); + MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2); + + // Create the windowed string, not including the potential line + // terminator. 
+ StringBuffer windowBuf(ts.context()); + if (!windowBuf.append(windowStart, windowEnd)) { + return; + } + + // The line of context must be null-terminated, and StringBuffer doesn't + // make that happen unless we force it to. + if (!windowBuf.append('\0')) { + return; + } + + err.lineOfContext.reset(windowBuf.stealChars()); + if (!err.lineOfContext) { + return; + } + + err.lineLength = windowLength; + err.tokenOffset = offset - (windowStart - start); + + va_list args; + va_start(args, length); + ReportCompileErrorLatin1(ts.context(), std::move(err), nullptr, errorNumber, + &args); + va_end(args); +} + +static void ReportSyntaxError(TokenStreamAnyChars& ts, + RegExpCompileData& result, + Handle<JSAtom*> pattern) { + JS::AutoCheckCannotGC nogc_; + if (pattern->hasLatin1Chars()) { + ReportSyntaxError(ts, Nothing(), Nothing(), result, + pattern->latin1Chars(nogc_), pattern->length()); + } else { + ReportSyntaxError(ts, Nothing(), Nothing(), result, + pattern->twoByteChars(nogc_), pattern->length()); + } +} + +template <typename CharT> +static bool CheckPatternSyntaxImpl(js::LifoAlloc& alloc, + JS::NativeStackLimit stackLimit, + const CharT* input, uint32_t inputLength, + JS::RegExpFlags flags, + RegExpCompileData* result, + JS::AutoAssertNoGC& nogc) { + LifoAllocScope allocScope(&alloc); + Zone zone(allocScope.alloc()); + + return RegExpParser::VerifyRegExpSyntax(&zone, stackLimit, input, inputLength, + flags, result, nogc); +} + +bool CheckPatternSyntax(js::LifoAlloc& alloc, JS::NativeStackLimit stackLimit, + TokenStreamAnyChars& ts, + const mozilla::Range<const char16_t> chars, + JS::RegExpFlags flags, mozilla::Maybe<uint32_t> line, + mozilla::Maybe<uint32_t> column) { + RegExpCompileData result; + JS::AutoAssertNoGC nogc; + if (!CheckPatternSyntaxImpl(alloc, stackLimit, chars.begin().get(), + chars.length(), flags, &result, nogc)) { + ReportSyntaxError(ts, line, column, result, chars.begin().get(), + chars.length()); + return false; + } + return true; +} + +bool 
CheckPatternSyntax(JSContext* cx, JS::NativeStackLimit stackLimit, + TokenStreamAnyChars& ts, Handle<JSAtom*> pattern, + JS::RegExpFlags flags) { + RegExpCompileData result; + JS::AutoAssertNoGC nogc(cx); + if (pattern->hasLatin1Chars()) { + if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit, + pattern->latin1Chars(nogc), pattern->length(), + flags, &result, nogc)) { + ReportSyntaxError(ts, result, pattern); + return false; + } + return true; + } + if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit, + pattern->twoByteChars(nogc), pattern->length(), + flags, &result, nogc)) { + ReportSyntaxError(ts, result, pattern); + return false; + } + return true; +} + +// A regexp is a good candidate for Boyer-Moore if it has at least 3 +// times as many characters as it has unique characters. Note that +// table lookups in irregexp are done modulo tableSize (128). +template <typename CharT> +static bool HasFewDifferentCharacters(const CharT* chars, size_t length) { + const uint32_t tableSize = + v8::internal::NativeRegExpMacroAssembler::kTableSize; + bool character_found[tableSize] = {}; + uint32_t different = 0; + for (uint32_t i = 0; i < length; i++) { + uint32_t ch = chars[i] % tableSize; + if (!character_found[ch]) { + character_found[ch] = true; + different++; + // We declare a regexp low-alphabet if it has at least 3 times as many + // characters as it has different characters. 
+ if (different * 3 > length) { + return false; + } + } + } + return true; +} + +// Identifies the sort of pattern where Boyer-Moore is faster than string search +static bool UseBoyerMoore(Handle<JSAtom*> pattern, JS::AutoAssertNoGC& nogc) { + size_t length = + std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length()); + if (length <= kPatternTooShortForBoyerMoore) { + return false; + } + + if (pattern->hasLatin1Chars()) { + return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length); + } + MOZ_ASSERT(pattern->hasTwoByteChars()); + return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length); +} + +// Sample character frequency information for use in Boyer-Moore. +static void SampleCharacters(Handle<JSLinearString*> sample_subject, + RegExpCompiler& compiler) { + static const int kSampleSize = 128; + int chars_sampled = 0; + + int length = sample_subject->length(); + + int half_way = (length - kSampleSize) / 2; + for (int i = std::max(0, half_way); i < length && chars_sampled < kSampleSize; + i++, chars_sampled++) { + compiler.frequency_collator()->CountCharacter( + sample_subject->latin1OrTwoByteChar(i)); + } +} + +// Recursively walking the AST for a deeply nested regexp (like +// `/(a(a(a(a(a(a(a(...(a)...))))))))/`) may overflow the stack while +// compiling. To avoid this, we use V8's implementation of the Visitor +// pattern to walk the AST first with an overly large stack frame. 
+class RegExpDepthCheck final : public v8::internal::RegExpVisitor { + public: + explicit RegExpDepthCheck(JSContext* cx) : cx_(cx) {} + + bool check(v8::internal::RegExpTree* root) { + return !!root->Accept(this, nullptr); + } + + // Leaf nodes with no children +#define LEAF_DEPTH(Kind) \ + void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \ + uint8_t padding[FRAME_PADDING]; \ + dummy_ = padding; /* Prevent padding from being optimized away.*/ \ + AutoCheckRecursionLimit recursion(cx_); \ + return (void*)recursion.checkDontReport(cx_); \ + } + + LEAF_DEPTH(Assertion) + LEAF_DEPTH(Atom) + LEAF_DEPTH(BackReference) + LEAF_DEPTH(ClassSetOperand) + LEAF_DEPTH(ClassRanges) + LEAF_DEPTH(Empty) + LEAF_DEPTH(Text) +#undef LEAF_DEPTH + + // Wrapper nodes with one child +#define WRAPPER_DEPTH(Kind) \ + void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \ + uint8_t padding[FRAME_PADDING]; \ + dummy_ = padding; /* Prevent padding from being optimized away.*/ \ + AutoCheckRecursionLimit recursion(cx_); \ + if (!recursion.checkDontReport(cx_)) { \ + return nullptr; \ + } \ + return node->body()->Accept(this, nullptr); \ + } + + WRAPPER_DEPTH(Capture) + WRAPPER_DEPTH(Group) + WRAPPER_DEPTH(Lookaround) + WRAPPER_DEPTH(Quantifier) +#undef WRAPPER_DEPTH + + void* VisitAlternative(v8::internal::RegExpAlternative* node, + void*) override { + uint8_t padding[FRAME_PADDING]; + dummy_ = padding; /* Prevent padding from being optimized away.*/ + AutoCheckRecursionLimit recursion(cx_); + if (!recursion.checkDontReport(cx_)) { + return nullptr; + } + for (auto* child : *node->nodes()) { + if (!child->Accept(this, nullptr)) { + return nullptr; + } + } + return (void*)true; + } + void* VisitDisjunction(v8::internal::RegExpDisjunction* node, + void*) override { + uint8_t padding[FRAME_PADDING]; + dummy_ = padding; /* Prevent padding from being optimized away.*/ + AutoCheckRecursionLimit recursion(cx_); + if (!recursion.checkDontReport(cx_)) { + return 
nullptr; + } + for (auto* child : *node->alternatives()) { + if (!child->Accept(this, nullptr)) { + return nullptr; + } + } + return (void*)true; + } + void* VisitClassSetExpression(v8::internal::RegExpClassSetExpression* node, + void*) override { + uint8_t padding[FRAME_PADDING]; + dummy_ = padding; /* Prevent padding from being optimized away.*/ + AutoCheckRecursionLimit recursion(cx_); + if (!recursion.checkDontReport(cx_)) { + return nullptr; + } + for (auto* child : *node->operands()) { + if (!child->Accept(this, nullptr)) { + return nullptr; + } + } + return (void*)true; + } + + private: + JSContext* cx_; + void* dummy_ = nullptr; + + // This size is picked to be comfortably larger than any + // RegExp*::ToNode stack frame. + static const size_t FRAME_PADDING = 256; +}; + +enum class AssembleResult { + Success, + TooLarge, + OutOfMemory, +}; + +[[nodiscard]] static AssembleResult Assemble( + JSContext* cx, RegExpCompiler* compiler, RegExpCompileData* data, + MutableHandleRegExpShared re, Handle<JSAtom*> pattern, Zone* zone, + bool useNativeCode, bool isLatin1) { + // Because we create a StackMacroAssembler, this function is not allowed + // to GC. If needed, we allocate and throw errors in the caller. + jit::TempAllocator temp(&cx->tempLifoAlloc()); + Maybe<jit::JitContext> jctx; + Maybe<js::jit::StackMacroAssembler> stack_masm; + UniquePtr<RegExpMacroAssembler> masm; + if (useNativeCode) { + NativeRegExpMacroAssembler::Mode mode = + isLatin1 ? NativeRegExpMacroAssembler::LATIN1 + : NativeRegExpMacroAssembler::UC16; + // If we are compiling native code, we need a macroassembler, + // which needs a jit context. + jctx.emplace(cx); + stack_masm.emplace(cx, temp); +#ifdef DEBUG + // It would be much preferable to use `class AutoCreatedBy` here, but we + // may be operating without an assembler at all if `useNativeCode` is + // `false`, so there's no place to put such a call. 
+ stack_masm.ref().pushCreator("Assemble() in RegExpAPI.cpp"); +#endif + uint32_t num_capture_registers = re->pairCount() * 2; + masm = MakeUnique<SMRegExpMacroAssembler>(cx, stack_masm.ref(), zone, mode, + num_capture_registers); + } else { + masm = MakeUnique<RegExpBytecodeGenerator>(cx->isolate, zone); + } + if (!masm) { + ReportOutOfMemory(cx); + return AssembleResult::OutOfMemory; + } + + bool isLargePattern = + pattern->length() > v8::internal::RegExp::kRegExpTooLargeToOptimize; + masm->set_slow_safe(isLargePattern); + if (compiler->optimize()) { + compiler->set_optimize(!isLargePattern); + } + + // When matching a regexp with known maximum length that is anchored + // at the end, we may be able to skip the beginning of long input + // strings. This decision is made here because it depends on + // information in the AST that isn't replicated in the Node + // structure used inside the compiler. + bool is_start_anchored = data->tree->IsAnchoredAtStart(); + bool is_end_anchored = data->tree->IsAnchoredAtEnd(); + int max_length = data->tree->max_match(); + static const int kMaxBacksearchLimit = 1024; + if (is_end_anchored && !is_start_anchored && !re->sticky() && + max_length < kMaxBacksearchLimit) { + masm->SetCurrentPositionFromEnd(max_length); + } + + if (re->global()) { + RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; + if (data->tree->min_match() > 0) { + mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; + } else if (re->unicode()) { + mode = RegExpMacroAssembler::GLOBAL_UNICODE; + } + masm->set_global_mode(mode); + } + + // The masm tracer works as a thin wrapper around another macroassembler. + RegExpMacroAssembler* masm_ptr = masm.get(); +#ifdef DEBUG + UniquePtr<RegExpMacroAssembler> tracer_masm; + if (jit::JitOptions.trace_regexp_assembler) { + tracer_masm = MakeUnique<RegExpMacroAssemblerTracer>(cx->isolate, masm_ptr); + masm_ptr = tracer_masm.get(); + } +#endif + + // Compile the regexp. 
+ V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate); + RegExpCompiler::CompilationResult result = compiler->Assemble( + cx->isolate, masm_ptr, data->node, data->capture_count, wrappedPattern); + + if (useNativeCode) { +#ifdef DEBUG + // See comment referencing `pushCreator` above. + stack_masm.ref().popCreator(); +#endif + } + + if (!result.Succeeded()) { + MOZ_ASSERT(result.error == RegExpError::kTooLarge); + return AssembleResult::TooLarge; + } + if (result.code->value().isUndefined()) { + // SMRegExpMacroAssembler::GetCode returns undefined on OOM. + MOZ_ASSERT(useNativeCode); + return AssembleResult::OutOfMemory; + } + + re->updateMaxRegisters(result.num_registers); + if (useNativeCode) { + // Transfer ownership of the tables from the macroassembler to the + // RegExpShared. + SMRegExpMacroAssembler::TableVector& tables = + static_cast<SMRegExpMacroAssembler*>(masm.get())->tables(); + for (uint32_t i = 0; i < tables.length(); i++) { + if (!re->addTable(std::move(tables[i]))) { + ReportOutOfMemory(cx); + return AssembleResult::OutOfMemory; + } + } + re->setJitCode(v8::internal::Code::cast(*result.code).inner(), isLatin1); + } else { + // Transfer ownership of the bytecode from the HandleScope to the + // RegExpShared. 
+ ByteArray bytecode = + v8::internal::ByteArray::cast(*result.code).takeOwnership(cx->isolate); + uint32_t length = bytecode->length; + re->setByteCode(bytecode.release(), isLatin1); + js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode); + } + + return AssembleResult::Success; +} + +struct RegExpCaptureIndexLess { + bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { + return lhs->index() < rhs->index(); + } +}; + +bool InitializeNamedCaptures(JSContext* cx, HandleRegExpShared re, + ZoneVector<RegExpCapture*>* namedCaptures) { + // The irregexp parser returns named capture information in the form + // of a ZoneVector of RegExpCaptures nodes, each of which stores the + // capture name and the corresponding capture index. We create a + // template object with a property for each capture name, and store + // the capture indices as a heap-allocated array. + uint32_t numNamedCaptures = namedCaptures->size(); + + // Named captures are sorted by name (because the set is used to ensure + // name uniqueness). But the capture name map must be sorted by index. + std::sort(namedCaptures->begin(), namedCaptures->end(), + RegExpCaptureIndexLess{}); + + // Create a plain template object. + Rooted<js::PlainObject*> templateObject( + cx, js::NewPlainObjectWithProto(cx, nullptr, TenuredObject)); + if (!templateObject) { + return false; + } + + // Allocate the capture index array. + uint32_t arraySize = numNamedCaptures * sizeof(uint32_t); + UniquePtr<uint32_t[], JS::FreePolicy> captureIndices( + static_cast<uint32_t*>(js_malloc(arraySize))); + if (!captureIndices) { + js::ReportOutOfMemory(cx); + return false; + } + + // Initialize the properties of the template and populate the + // capture index array. 
+ RootedId id(cx); + RootedValue dummyString(cx, StringValue(cx->runtime()->emptyString)); + for (uint32_t i = 0; i < numNamedCaptures; i++) { + RegExpCapture* capture = (*namedCaptures)[i]; + JSAtom* name = + js::AtomizeChars(cx, capture->name()->data(), capture->name()->size()); + if (!name) { + return false; + } + id = NameToId(name->asPropertyName()); + if (!NativeDefineDataProperty(cx, templateObject, id, dummyString, + JSPROP_ENUMERATE)) { + return false; + } + captureIndices[i] = capture->index(); + } + + RegExpShared::InitializeNamedCaptures( + cx, re, numNamedCaptures, templateObject, captureIndices.release()); + return true; +} + +bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, + RegExpShared::CodeKind codeKind) { + Rooted<JSAtom*> pattern(cx, re->getSource()); + JS::RegExpFlags flags = re->getFlags(); + LifoAllocScope allocScope(&cx->tempLifoAlloc()); + HandleScope handleScope(cx->isolate); + Zone zone(allocScope.alloc()); + + RegExpCompileData data; + { + V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate); + if (!RegExpParser::ParseRegExpFromHeapString( + cx->isolate, &zone, wrappedPattern, flags, &data)) { + AutoReportFrontendContext fc(cx); + JS::CompileOptions options(cx); + DummyTokenStream dummyTokenStream(&fc, options); + ReportSyntaxError(dummyTokenStream, data, pattern); + return false; + } + } + + // Avoid stack overflow while recursively walking the AST. + RegExpDepthCheck depthCheck(cx); + if (!depthCheck.check(data.tree)) { + JS_ReportErrorASCII(cx, "regexp too big"); + cx->reportResourceExhaustion(); + return false; + } + + if (re->kind() == RegExpShared::Kind::Unparsed) { + // This is the first time we have compiled this regexp. + // First, check to see if we should use simple string search + // with an atom. 
+ if (!flags.ignoreCase() && !flags.sticky()) { + Rooted<JSAtom*> searchAtom(cx); + if (data.simple) { + // The parse-tree is a single atom that is equal to the pattern. + searchAtom = re->getSource(); + } else if (data.tree->IsAtom() && data.capture_count == 0) { + // The parse-tree is a single atom that is not equal to the pattern. + v8::internal::RegExpAtom* atom = data.tree->AsAtom(); + const char16_t* twoByteChars = atom->data().begin(); + searchAtom = AtomizeChars(cx, twoByteChars, atom->length()); + if (!searchAtom) { + return false; + } + } + JS::AutoAssertNoGC nogc(cx); + if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) { + re->useAtomMatch(searchAtom); + return true; + } + } + if (data.named_captures) { + if (!InitializeNamedCaptures(cx, re, data.named_captures)) { + return false; + } + } + // All fallible initialization has succeeded, so we can change state. + // Add one to capture_count to account for the whole-match capture. + uint32_t pairCount = data.capture_count + 1; + re->useRegExpMatch(pairCount); + } + + MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp); + + RegExpCompiler compiler(cx->isolate, &zone, data.capture_count, flags, + input->hasLatin1Chars()); + + bool isLatin1 = input->hasLatin1Chars(); + + SampleCharacters(input, compiler); + data.node = compiler.PreprocessRegExp(&data, flags, isLatin1); + data.error = AnalyzeRegExp(cx->isolate, isLatin1, flags, data.node); + if (data.error != RegExpError::kNone) { + MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow); + ReportOverRecursed(cx); + return false; + } + + bool useNativeCode = codeKind == RegExpShared::CodeKind::Jitcode; + MOZ_ASSERT_IF(useNativeCode, IsNativeRegExpEnabled()); + + switch (Assemble(cx, &compiler, &data, re, pattern, &zone, useNativeCode, + isLatin1)) { + case AssembleResult::TooLarge: + JS_ReportErrorASCII(cx, "regexp too big"); + cx->reportResourceExhaustion(); + return false; + case AssembleResult::OutOfMemory: + MOZ_ASSERT(cx->isThrowingOutOfMemory()); 
+ return false; + case AssembleResult::Success: + break; + } + return true; +} + +template <typename CharT> +RegExpRunStatus ExecuteRaw(jit::JitCode* code, const CharT* chars, + size_t length, size_t startIndex, + VectorMatchPairs* matches) { + InputOutputData data(chars, chars + length, startIndex, matches); + + static_assert(RegExpRunStatus_Error == + v8::internal::RegExp::kInternalRegExpException); + static_assert(RegExpRunStatus_Success == + v8::internal::RegExp::kInternalRegExpSuccess); + static_assert(RegExpRunStatus_Success_NotFound == + v8::internal::RegExp::kInternalRegExpFailure); + + typedef int (*RegExpCodeSignature)(InputOutputData*); + auto function = reinterpret_cast<RegExpCodeSignature>(code->raw()); + { + JS::AutoSuppressGCAnalysis nogc; + return (RegExpRunStatus)CALL_GENERATED_1(function, &data); + } +} + +RegExpRunStatus Interpret(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, size_t startIndex, + VectorMatchPairs* matches) { + MOZ_ASSERT(re->getByteCode(input->hasLatin1Chars())); + + HandleScope handleScope(cx->isolate); + V8HandleRegExp wrappedRegExp(v8::internal::JSRegExp(re), cx->isolate); + V8HandleString wrappedInput(v8::internal::String(input), cx->isolate); + + static_assert(RegExpRunStatus_Error == + v8::internal::RegExp::kInternalRegExpException); + static_assert(RegExpRunStatus_Success == + v8::internal::RegExp::kInternalRegExpSuccess); + static_assert(RegExpRunStatus_Success_NotFound == + v8::internal::RegExp::kInternalRegExpFailure); + + RegExpRunStatus status = + (RegExpRunStatus)IrregexpInterpreter::MatchForCallFromRuntime( + cx->isolate, wrappedRegExp, wrappedInput, matches->pairsRaw(), + uint32_t(matches->pairCount() * 2), uint32_t(startIndex)); + + MOZ_ASSERT(status == RegExpRunStatus_Error || + status == RegExpRunStatus_Success || + status == RegExpRunStatus_Success_NotFound); + + return status; +} + +RegExpRunStatus Execute(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> 
input, size_t startIndex, + VectorMatchPairs* matches) { + bool latin1 = input->hasLatin1Chars(); + jit::JitCode* jitCode = re->getJitCode(latin1); + bool isCompiled = !!jitCode; + + // Reset the Irregexp backtrack stack if it grows during execution. + irregexp::RegExpStackScope stackScope(cx->isolate); + + if (isCompiled) { + JS::AutoCheckCannotGC nogc; + if (latin1) { + return ExecuteRaw(jitCode, input->latin1Chars(nogc), input->length(), + startIndex, matches); + } + return ExecuteRaw(jitCode, input->twoByteChars(nogc), input->length(), + startIndex, matches); + } + + return Interpret(cx, re, input, startIndex, matches); +} + +RegExpRunStatus ExecuteForFuzzing(JSContext* cx, Handle<JSAtom*> pattern, + Handle<JSLinearString*> input, + JS::RegExpFlags flags, size_t startIndex, + VectorMatchPairs* matches, + RegExpShared::CodeKind codeKind) { + RootedRegExpShared re(cx, cx->zone()->regExps().get(cx, pattern, flags)); + if (!RegExpShared::compileIfNecessary(cx, &re, input, codeKind)) { + return RegExpRunStatus_Error; + } + return RegExpShared::execute(cx, &re, input, startIndex, matches); +} + +bool GrowBacktrackStack(RegExpStack* regexp_stack) { + return SMRegExpMacroAssembler::GrowBacktrackStack(regexp_stack); +} + +uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength) { + return SMRegExpMacroAssembler::CaseInsensitiveCompareNonUnicode( + substring1, substring2, byteLength); +} + +uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength) { + return SMRegExpMacroAssembler::CaseInsensitiveCompareUnicode( + substring1, substring2, byteLength); +} + +bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges) { + return SMRegExpMacroAssembler::IsCharacterInRangeArray(c, ranges); +} + +#ifdef DEBUG +bool IsolateShouldSimulateInterrupt(Isolate* isolate) { + return isolate->shouldSimulateInterrupt_ != 0; +} + +void 
IsolateSetShouldSimulateInterrupt(Isolate* isolate) { + isolate->shouldSimulateInterrupt_ = 1; +} +void IsolateClearShouldSimulateInterrupt(Isolate* isolate) { + isolate->shouldSimulateInterrupt_ = 0; +} +#endif + +} // namespace irregexp +} // namespace js diff --git a/js/src/irregexp/RegExpAPI.h b/js/src/irregexp/RegExpAPI.h new file mode 100644 index 0000000000..b4c37f6f23 --- /dev/null +++ b/js/src/irregexp/RegExpAPI.h @@ -0,0 +1,96 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* This is the interface that the regexp engine exposes to SpiderMonkey. */ + +#ifndef regexp_RegExpAPI_h +#define regexp_RegExpAPI_h + +#include "mozilla/Maybe.h" +#include "mozilla/MemoryReporting.h" +#include "mozilla/Range.h" + +#include <stddef.h> +#include <stdint.h> + +#include "jstypes.h" + +#include "irregexp/RegExpTypes.h" +#include "js/Stack.h" // JS::NativeStackLimit +#include "vm/RegExpShared.h" + +struct JS_PUBLIC_API JSContext; +class JS_PUBLIC_API JSTracer; + +namespace JS { +class RegExpFlags; +} + +namespace v8::internal { +class RegExpStack; +} + +namespace js { + +class VectorMatchPairs; +class LifoAlloc; + +namespace frontend { +class TokenStreamAnyChars; +} + +namespace irregexp { + +Isolate* CreateIsolate(JSContext* cx); +void TraceIsolate(JSTracer* trc, Isolate* isolate); +void DestroyIsolate(Isolate* isolate); + +size_t IsolateSizeOfIncludingThis(Isolate* isolate, + mozilla::MallocSizeOf mallocSizeOf); + +bool CheckPatternSyntax(js::LifoAlloc& alloc, JS::NativeStackLimit stackLimit, + frontend::TokenStreamAnyChars& ts, + const mozilla::Range<const char16_t> chars, + JS::RegExpFlags flags, + mozilla::Maybe<uint32_t> line = mozilla::Nothing(), + mozilla::Maybe<uint32_t> column = 
mozilla::Nothing()); +bool CheckPatternSyntax(JSContext* cx, JS::NativeStackLimit stackLimit, + frontend::TokenStreamAnyChars& ts, + Handle<JSAtom*> pattern, JS::RegExpFlags flags); + +bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, + RegExpShared::CodeKind codeKind); + +RegExpRunStatus Execute(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, size_t start, + VectorMatchPairs* matches); + +RegExpRunStatus ExecuteForFuzzing(JSContext* cx, Handle<JSAtom*> pattern, + Handle<JSLinearString*> input, + JS::RegExpFlags flags, size_t startIndex, + VectorMatchPairs* matches, + RegExpShared::CodeKind codeKind); + +bool GrowBacktrackStack(v8::internal::RegExpStack* regexp_stack); + +uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength); +uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength); +bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges); + +#ifdef DEBUG +bool IsolateShouldSimulateInterrupt(Isolate* isolate); +void IsolateSetShouldSimulateInterrupt(Isolate* isolate); +void IsolateClearShouldSimulateInterrupt(Isolate* isolate); +#endif +} // namespace irregexp +} // namespace js + +#endif /* regexp_RegExpAPI_h */ diff --git a/js/src/irregexp/RegExpNativeMacroAssembler.cpp b/js/src/irregexp/RegExpNativeMacroAssembler.cpp new file mode 100644 index 0000000000..b6b13ca3d8 --- /dev/null +++ b/js/src/irregexp/RegExpNativeMacroAssembler.cpp @@ -0,0 +1,1406 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2020 the V8 project authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/imported/regexp-macro-assembler-arch.h" +#include "irregexp/imported/regexp-stack.h" +#include "irregexp/imported/special-case.h" +#include "jit/Linker.h" +#include "jit/PerfSpewer.h" +#include "vm/MatchPairs.h" +#include "vm/Realm.h" +#ifdef MOZ_VTUNE +# include "vtune/VTuneWrapper.h" +#endif + +#include "jit/ABIFunctionList-inl.h" +#include "jit/MacroAssembler-inl.h" + +namespace v8 { +namespace internal { + +using js::MatchPairs; +using js::jit::AbsoluteAddress; +using js::jit::Address; +using js::jit::AllocatableGeneralRegisterSet; +using js::jit::Assembler; +using js::jit::BaseIndex; +using js::jit::CodeLocationLabel; +using js::jit::GeneralRegisterBackwardIterator; +using js::jit::GeneralRegisterForwardIterator; +using js::jit::GeneralRegisterSet; +using js::jit::Imm32; +using js::jit::ImmPtr; +using js::jit::ImmWord; +using js::jit::JitCode; +using js::jit::Linker; +using js::jit::LiveGeneralRegisterSet; +using js::jit::Register; +using js::jit::Registers; +using js::jit::StackMacroAssembler; + +SMRegExpMacroAssembler::SMRegExpMacroAssembler(JSContext* cx, + StackMacroAssembler& masm, + Zone* zone, Mode mode, + uint32_t num_capture_registers) + : NativeRegExpMacroAssembler(cx->isolate.ref(), zone), + cx_(cx), + masm_(masm), + mode_(mode), + num_registers_(num_capture_registers), + num_capture_registers_(num_capture_registers) { + // Each capture has a start and an end register + MOZ_ASSERT(num_capture_registers_ % 2 == 0); + + AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All()); + + input_end_pointer_ = regs.takeAny(); + current_character_ = regs.takeAny(); + current_position_ = regs.takeAny(); + backtrack_stack_pointer_ = regs.takeAny(); + temp0_ = regs.takeAny(); + temp1_ = regs.takeAny(); + if (!regs.empty()) { + // Not enough registers on x86. 
+ temp2_ = regs.takeAny(); + } + savedRegisters_ = js::jit::SavedNonVolatileRegisters(regs); + + masm_.jump(&entry_label_); // We'll generate the entry code later + masm_.bind(&start_label_); // and continue from here. +} + +int SMRegExpMacroAssembler::stack_limit_slack() { + return RegExpStack::kStackLimitSlack; +} + +void SMRegExpMacroAssembler::AdvanceCurrentPosition(int by) { + if (by != 0) { + masm_.addPtr(Imm32(by * char_size()), current_position_); + } +} + +void SMRegExpMacroAssembler::AdvanceRegister(int reg, int by) { + MOZ_ASSERT(reg >= 0 && reg < num_registers_); + if (by != 0) { + masm_.addPtr(Imm32(by), register_location(reg)); + } +} + +void SMRegExpMacroAssembler::Backtrack() { +#ifdef DEBUG + js::jit::Label bailOut; + // Check for simulating interrupt + masm_.branch32(Assembler::NotEqual, + AbsoluteAddress(&cx_->isolate->shouldSimulateInterrupt_), + Imm32(0), &bailOut); +#endif + // Check for an interrupt. We have to restart from the beginning if we + // are interrupted, so we only check for urgent interrupts. + js::jit::Label noInterrupt; + masm_.branchTest32( + Assembler::Zero, AbsoluteAddress(cx_->addressOfInterruptBits()), + Imm32(uint32_t(js::InterruptReason::CallbackUrgent)), &noInterrupt); +#ifdef DEBUG + // bailing out if we have simulating interrupt flag set + masm_.bind(&bailOut); +#endif + masm_.movePtr(ImmWord(js::RegExpRunStatus_Error), temp0_); + masm_.jump(&exit_label_); + masm_.bind(&noInterrupt); + + // Pop code location from backtrack stack and jump to location. 
+ Pop(temp0_); + masm_.jump(temp0_); +} + +void SMRegExpMacroAssembler::Bind(Label* label) { + masm_.bind(label->inner()); + if (label->patchOffset_.bound()) { + AddLabelPatch(label->patchOffset_, label->pos()); + } +} + +// Check if current_position + cp_offset is the input start +void SMRegExpMacroAssembler::CheckAtStartImpl(int cp_offset, Label* on_cond, + Assembler::Condition cond) { + Address addr(current_position_, cp_offset * char_size()); + masm_.computeEffectiveAddress(addr, temp0_); + + masm_.branchPtr(cond, inputStart(), temp0_, LabelOrBacktrack(on_cond)); +} + +void SMRegExpMacroAssembler::CheckAtStart(int cp_offset, Label* on_at_start) { + CheckAtStartImpl(cp_offset, on_at_start, Assembler::Equal); +} + +void SMRegExpMacroAssembler::CheckNotAtStart(int cp_offset, + Label* on_not_at_start) { + CheckAtStartImpl(cp_offset, on_not_at_start, Assembler::NotEqual); +} + +void SMRegExpMacroAssembler::CheckCharacterImpl(Imm32 c, Label* on_cond, + Assembler::Condition cond) { + masm_.branch32(cond, current_character_, c, LabelOrBacktrack(on_cond)); +} + +void SMRegExpMacroAssembler::CheckCharacter(uint32_t c, Label* on_equal) { + CheckCharacterImpl(Imm32(c), on_equal, Assembler::Equal); +} + +void SMRegExpMacroAssembler::CheckNotCharacter(uint32_t c, + Label* on_not_equal) { + CheckCharacterImpl(Imm32(c), on_not_equal, Assembler::NotEqual); +} + +void SMRegExpMacroAssembler::CheckCharacterGT(base::uc16 limit, + Label* on_greater) { + CheckCharacterImpl(Imm32(limit), on_greater, Assembler::GreaterThan); +} + +void SMRegExpMacroAssembler::CheckCharacterLT(base::uc16 limit, + Label* on_less) { + CheckCharacterImpl(Imm32(limit), on_less, Assembler::LessThan); +} + +// Bitwise-and the current character with mask and then check for a +// match with c. +void SMRegExpMacroAssembler::CheckCharacterAfterAndImpl(uint32_t c, + uint32_t mask, + Label* on_cond, + bool is_not) { + if (c == 0) { + Assembler::Condition cond = is_not ? 
Assembler::NonZero : Assembler::Zero; + masm_.branchTest32(cond, current_character_, Imm32(mask), + LabelOrBacktrack(on_cond)); + } else { + Assembler::Condition cond = is_not ? Assembler::NotEqual : Assembler::Equal; + masm_.move32(Imm32(mask), temp0_); + masm_.and32(current_character_, temp0_); + masm_.branch32(cond, temp0_, Imm32(c), LabelOrBacktrack(on_cond)); + } +} + +void SMRegExpMacroAssembler::CheckCharacterAfterAnd(uint32_t c, uint32_t mask, + Label* on_equal) { + CheckCharacterAfterAndImpl(c, mask, on_equal, /*is_not =*/false); +} + +void SMRegExpMacroAssembler::CheckNotCharacterAfterAnd(uint32_t c, + uint32_t mask, + Label* on_not_equal) { + CheckCharacterAfterAndImpl(c, mask, on_not_equal, /*is_not =*/true); +} + +// Subtract minus from the current character, then bitwise-and the +// result with mask, then check for a match with c. +void SMRegExpMacroAssembler::CheckNotCharacterAfterMinusAnd( + base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) { + masm_.computeEffectiveAddress(Address(current_character_, -minus), temp0_); + if (c == 0) { + masm_.branchTest32(Assembler::NonZero, temp0_, Imm32(mask), + LabelOrBacktrack(on_not_equal)); + } else { + masm_.and32(Imm32(mask), temp0_); + masm_.branch32(Assembler::NotEqual, temp0_, Imm32(c), + LabelOrBacktrack(on_not_equal)); + } +} + +// If the current position matches the position stored on top of the backtrack +// stack, pops the backtrack stack and branches to the given label. +void SMRegExpMacroAssembler::CheckGreedyLoop(Label* on_equal) { + js::jit::Label fallthrough; + masm_.branchPtr(Assembler::NotEqual, Address(backtrack_stack_pointer_, 0), + current_position_, &fallthrough); + masm_.addPtr(Imm32(sizeof(void*)), backtrack_stack_pointer_); // Pop. 
+ JumpOrBacktrack(on_equal); + masm_.bind(&fallthrough); +} + +void SMRegExpMacroAssembler::CheckCharacterInRangeImpl( + base::uc16 from, base::uc16 to, Label* on_cond, Assembler::Condition cond) { + // x is in [from,to] if unsigned(x - from) <= to - from + masm_.computeEffectiveAddress(Address(current_character_, -from), temp0_); + masm_.branch32(cond, temp0_, Imm32(to - from), LabelOrBacktrack(on_cond)); +} + +void SMRegExpMacroAssembler::CheckCharacterInRange(base::uc16 from, + base::uc16 to, + Label* on_in_range) { + CheckCharacterInRangeImpl(from, to, on_in_range, Assembler::BelowOrEqual); +} + +void SMRegExpMacroAssembler::CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, + Label* on_not_in_range) { + CheckCharacterInRangeImpl(from, to, on_not_in_range, Assembler::Above); +} + +/* static */ +bool SMRegExpMacroAssembler::IsCharacterInRangeArray(uint32_t c, + ByteArrayData* ranges) { + js::AutoUnsafeCallWithABI unsafe; + MOZ_ASSERT(ranges->length % sizeof(uint16_t) == 0); + uint32_t length = ranges->length / sizeof(uint16_t); + MOZ_ASSERT(length > 0); + + // Fast paths. + if (c < ranges->getTyped<uint16_t>(0)) { + // |c| is lower than the start of the first range. + // It is not in the range array. + return false; + } + if (c >= ranges->getTyped<uint16_t>(length - 1)) { + // |c| is higher than the last entry. If the table contains an odd + // number of entries, the last range is open-ended, so |c| is in + // the range array iff |length| is odd. + return (length % 2) != 0; + } + + // |ranges| is stored as an interval list: an ordered list of + // starting points, where every even index marks the beginning of a + // range of characters that are included, and every odd index marks + // the beginning of a range of characters that are excluded. For + // example, the set [1,2,3,7,8,9] would be represented as the + // range array [1,4,7,10]. 
If |ranges| has an odd number of entries, + // the last included range is open-ended (so the set containing + // every character would be represented as [0]). + // + // Because of the symmetry between included and excluded ranges, we + // can do a binary search for the index in |ranges| with the value + // closest to but not exceeding |c|. If that index is even, |c| is + // in an included range. If that index is odd, |c| is in an excluded + // range. + uint32_t lower = 0; + uint32_t upper = length; + uint32_t mid = 0; + do { + mid = lower + (upper - lower) / 2; + const base::uc16 elem = ranges->getTyped<uint16_t>(mid); + if (c < elem) { + upper = mid; + } else if (c > elem) { + lower = mid + 1; + } else { + break; + } + } while (lower < upper); + uint32_t rangeIndex = c < ranges->getTyped<uint16_t>(mid) ? mid - 1 : mid; + + // Included ranges start at even indices and end at odd indices. + return rangeIndex % 2 == 0; +} + +void SMRegExpMacroAssembler::CallIsCharacterInRangeArray( + const ZoneList<CharacterRange>* ranges) { + Handle<ByteArray> rangeArray = GetOrAddRangeArray(ranges); + masm_.movePtr(ImmPtr(rangeArray->inner()), temp0_); + + // Save volatile regs. Temp regs don't need to be saved. + LiveGeneralRegisterSet volatileRegs(GeneralRegisterSet::Volatile()); + volatileRegs.takeUnchecked(temp0_); + volatileRegs.takeUnchecked(temp1_); + if (temp2_ != js::jit::InvalidReg) { + volatileRegs.takeUnchecked(temp2_); + } + masm_.PushRegsInMask(volatileRegs); + + using Fn = bool (*)(uint32_t, ByteArrayData*); + masm_.setupUnalignedABICall(temp1_); + masm_.passABIArg(current_character_); + masm_.passABIArg(temp0_); + + masm_.callWithABI<Fn, ::js::irregexp::IsCharacterInRangeArray>(); + masm_.storeCallBoolResult(temp1_); + masm_.PopRegsInMask(volatileRegs); + + // GetOrAddRangeArray caches previously seen range arrays to reduce + // memory usage, so this may not be the first time we've seen this + // range array. 
We only need to transfer ownership from the + // HandleScope to the |tables_| vector once. + PseudoHandle<ByteArrayData> rawRangeArray = + rangeArray->maybeTakeOwnership(isolate()); + if (rawRangeArray) { + AddTable(std::move(rawRangeArray)); + } +} + +bool SMRegExpMacroAssembler::CheckCharacterInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_in_range) { + CallIsCharacterInRangeArray(ranges); + masm_.branchTest32(Assembler::NonZero, temp1_, temp1_, + LabelOrBacktrack(on_in_range)); + return true; +} + +bool SMRegExpMacroAssembler::CheckCharacterNotInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) { + CallIsCharacterInRangeArray(ranges); + masm_.branchTest32(Assembler::Zero, temp1_, temp1_, + LabelOrBacktrack(on_not_in_range)); + return true; +} + +void SMRegExpMacroAssembler::CheckBitInTable(Handle<ByteArray> table, + Label* on_bit_set) { + // Claim ownership of the ByteArray from the current HandleScope. + // ByteArrays are allocated on the C++ heap and are (eventually) + // owned by the RegExpShared. + PseudoHandle<ByteArrayData> rawTable = table->takeOwnership(isolate()); + + masm_.movePtr(ImmPtr(rawTable->data()), temp0_); + + masm_.move32(Imm32(kTableMask), temp1_); + masm_.and32(current_character_, temp1_); + + masm_.load8ZeroExtend(BaseIndex(temp0_, temp1_, js::jit::TimesOne), temp0_); + masm_.branchTest32(Assembler::NonZero, temp0_, temp0_, + LabelOrBacktrack(on_bit_set)); + + // Transfer ownership of |rawTable| to the |tables_| vector. + AddTable(std::move(rawTable)); +} + +void SMRegExpMacroAssembler::CheckNotBackReferenceImpl(int start_reg, + bool read_backward, + bool unicode, + Label* on_no_match, + bool ignore_case) { + js::jit::Label fallthrough; + + // Captures are stored as a sequential pair of registers. + // Find the length of the back-referenced capture and load the + // capture's start index into current_character_. 
+ masm_.loadPtr(register_location(start_reg), // index of start + current_character_); + masm_.loadPtr(register_location(start_reg + 1), temp0_); // index of end + masm_.subPtr(current_character_, temp0_); // length of capture + + // Capture registers are either both set or both cleared. + // If the capture length is zero, then the capture is either empty or cleared. + // Fall through in both cases. + masm_.branchPtr(Assembler::Equal, temp0_, ImmWord(0), &fallthrough); + + // Check that there are sufficient characters left in the input. + if (read_backward) { + // If start + len > current, there isn't enough room for a + // lookbehind backreference. + masm_.loadPtr(inputStart(), temp1_); + masm_.addPtr(temp0_, temp1_); + masm_.branchPtr(Assembler::GreaterThan, temp1_, current_position_, + LabelOrBacktrack(on_no_match)); + } else { + // current_position_ is the negative offset from the end. + // If current + len > 0, there isn't enough room for a backreference. + masm_.movePtr(current_position_, temp1_); + masm_.addPtr(temp0_, temp1_); + masm_.branchPtr(Assembler::GreaterThan, temp1_, ImmWord(0), + LabelOrBacktrack(on_no_match)); + } + + if (mode_ == UC16 && ignore_case) { + // We call a helper function for case-insensitive non-latin1 strings. + + // Save volatile regs. temp1_, temp2_, and current_character_ + // don't need to be saved. current_position_ needs to be saved + // even if it's non-volatile, because we modify it to use as an argument. + LiveGeneralRegisterSet volatileRegs(GeneralRegisterSet::Volatile()); + volatileRegs.addUnchecked(current_position_); + volatileRegs.takeUnchecked(temp1_); + if (temp2_ != js::jit::InvalidReg) { + volatileRegs.takeUnchecked(temp2_); + } + volatileRegs.takeUnchecked(current_character_); + masm_.PushRegsInMask(volatileRegs); + + // Parameters are + // Address captured - Address of captured substring's start. + // Address current - Address of current character position. 
+ // size_t byte_length - length of capture (in bytes) + + // Compute |captured| + masm_.addPtr(input_end_pointer_, current_character_); + + // Compute |current| + masm_.addPtr(input_end_pointer_, current_position_); + if (read_backward) { + // Offset by length when matching backwards. + masm_.subPtr(temp0_, current_position_); + } + + using Fn = uint32_t (*)(const char16_t*, const char16_t*, size_t); + masm_.setupUnalignedABICall(temp1_); + masm_.passABIArg(current_character_); + masm_.passABIArg(current_position_); + masm_.passABIArg(temp0_); + + if (unicode) { + masm_.callWithABI<Fn, ::js::irregexp::CaseInsensitiveCompareUnicode>(); + } else { + masm_.callWithABI<Fn, ::js::irregexp::CaseInsensitiveCompareNonUnicode>(); + } + masm_.storeCallInt32Result(temp1_); + masm_.PopRegsInMask(volatileRegs); + masm_.branchTest32(Assembler::Zero, temp1_, temp1_, + LabelOrBacktrack(on_no_match)); + + // On success, advance position by length of capture + if (read_backward) { + masm_.subPtr(temp0_, current_position_); + } else { + masm_.addPtr(temp0_, current_position_); + } + + masm_.bind(&fallthrough); + return; + } + + // We will be modifying current_position_. Save it in case the match fails. + masm_.push(current_position_); + + // Compute start of capture string + masm_.addPtr(input_end_pointer_, current_character_); + + // Compute start of match string + masm_.addPtr(input_end_pointer_, current_position_); + if (read_backward) { + // Offset by length when matching backwards. + masm_.subPtr(temp0_, current_position_); + } + + // Compute end of match string + masm_.addPtr(current_position_, temp0_); + + Register nextCaptureChar = temp1_; + Register nextMatchChar = temp2_; + + if (temp2_ == js::jit::InvalidReg) { + masm_.push(backtrack_stack_pointer_); + nextMatchChar = backtrack_stack_pointer_; + } + + js::jit::Label success; + js::jit::Label fail; + js::jit::Label loop; + masm_.bind(&loop); + + // Load next character from each string. 
+ if (mode_ == LATIN1) { + masm_.load8ZeroExtend(Address(current_character_, 0), nextCaptureChar); + masm_.load8ZeroExtend(Address(current_position_, 0), nextMatchChar); + } else { + masm_.load16ZeroExtend(Address(current_character_, 0), nextCaptureChar); + masm_.load16ZeroExtend(Address(current_position_, 0), nextMatchChar); + } + + if (ignore_case) { + MOZ_ASSERT(mode_ == LATIN1); + // Try exact match. + js::jit::Label loop_increment; + masm_.branch32(Assembler::Equal, nextCaptureChar, nextMatchChar, + &loop_increment); + + // Mismatch. Try case-insensitive match. + // Force the capture character to lower case (by setting bit 0x20) + // then check to see if it is a letter. + js::jit::Label convert_match; + masm_.or32(Imm32(0x20), nextCaptureChar); + + // Check if it is in [a,z]. + masm_.computeEffectiveAddress(Address(nextCaptureChar, -'a'), + nextMatchChar); + masm_.branch32(Assembler::BelowOrEqual, nextMatchChar, Imm32('z' - 'a'), + &convert_match); + // Check for values in range [224,254]. + // Exclude 247 (U+00F7 DIVISION SIGN). + masm_.sub32(Imm32(224 - 'a'), nextMatchChar); + masm_.branch32(Assembler::Above, nextMatchChar, Imm32(254 - 224), &fail); + masm_.branch32(Assembler::Equal, nextMatchChar, Imm32(247 - 224), &fail); + + // Capture character is lower case. Convert match character + // to lower case and compare. + masm_.bind(&convert_match); + masm_.load8ZeroExtend(Address(current_position_, 0), nextMatchChar); + masm_.or32(Imm32(0x20), nextMatchChar); + masm_.branch32(Assembler::NotEqual, nextCaptureChar, nextMatchChar, &fail); + + masm_.bind(&loop_increment); + } else { + // Fail if characters do not match. + masm_.branch32(Assembler::NotEqual, nextCaptureChar, nextMatchChar, &fail); + } + + // Increment pointers into match and capture strings. + masm_.addPtr(Imm32(char_size()), current_character_); + masm_.addPtr(Imm32(char_size()), current_position_); + + // Loop if we have not reached the end of the match string. 
+ masm_.branchPtr(Assembler::Below, current_position_, temp0_, &loop); + masm_.jump(&success); + + // If we fail, restore current_position_ and branch. + masm_.bind(&fail); + if (temp2_ == js::jit::InvalidReg) { + // Restore backtrack_stack_pointer_ when it was used as a temp register. + masm_.pop(backtrack_stack_pointer_); + } + masm_.pop(current_position_); + JumpOrBacktrack(on_no_match); + + masm_.bind(&success); + + if (temp2_ == js::jit::InvalidReg) { + // Restore backtrack_stack_pointer_ when it was used as a temp register. + masm_.pop(backtrack_stack_pointer_); + } + // Drop saved value of current_position_ + masm_.addToStackPtr(Imm32(sizeof(uintptr_t))); + + // current_position_ is a pointer. Convert it back to an offset. + masm_.subPtr(input_end_pointer_, current_position_); + if (read_backward) { + // Subtract match length if we matched backward + masm_.addPtr(register_location(start_reg), current_position_); + masm_.subPtr(register_location(start_reg + 1), current_position_); + } + + masm_.bind(&fallthrough); +} + +// Branch if a back-reference does not match a previous capture. +void SMRegExpMacroAssembler::CheckNotBackReference(int start_reg, + bool read_backward, + Label* on_no_match) { + CheckNotBackReferenceImpl(start_reg, read_backward, /*unicode = */ false, + on_no_match, /*ignore_case = */ false); +} + +void SMRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + CheckNotBackReferenceImpl(start_reg, read_backward, unicode, on_no_match, + /*ignore_case = */ true); +} + +// Checks whether the given offset from the current position is +// inside the input string. +void SMRegExpMacroAssembler::CheckPosition(int cp_offset, + Label* on_outside_input) { + // Note: current_position_ is a (negative) byte offset relative to + // the end of the input string. 
+ if (cp_offset >= 0) { + // end + current + offset >= end + // <=> current + offset >= 0 + // <=> current >= -offset + masm_.branchPtr(Assembler::GreaterThanOrEqual, current_position_, + ImmWord(-cp_offset * char_size()), + LabelOrBacktrack(on_outside_input)); + } else { + // Compute offset position + masm_.computeEffectiveAddress( + Address(current_position_, cp_offset * char_size()), temp0_); + + // Compare to start of input. + masm_.branchPtr(Assembler::GreaterThan, inputStart(), temp0_, + LabelOrBacktrack(on_outside_input)); + } +} + +// This function attempts to generate special case code for character classes. +// Returns true if a special case is generated. +// Otherwise returns false and generates no code. +bool SMRegExpMacroAssembler::CheckSpecialCharacterClass( + StandardCharacterSet type, Label* on_no_match) { + js::jit::Label* no_match = LabelOrBacktrack(on_no_match); + + // Note: throughout this function, range checks (c in [min, max]) + // are implemented by an unsigned (c - min) <= (max - min) check. + switch (type) { + case StandardCharacterSet::kWhitespace: { + // Match space-characters + if (mode_ != LATIN1) { + return false; + } + js::jit::Label success; + // One byte space characters are ' ', '\t'..'\r', and '\u00a0' (NBSP). + + // Check ' ' + masm_.branch32(Assembler::Equal, current_character_, Imm32(' '), + &success); + + // Check '\t'..'\r' + masm_.computeEffectiveAddress(Address(current_character_, -'\t'), temp0_); + masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32('\r' - '\t'), + &success); + + // Check \u00a0. + masm_.branch32(Assembler::NotEqual, temp0_, Imm32(0x00a0 - '\t'), + no_match); + + masm_.bind(&success); + return true; + } + case StandardCharacterSet::kNotWhitespace: + // The emitted code for generic character classes is good enough. 
+ return false; + case StandardCharacterSet::kDigit: + // Match latin1 digits ('0'-'9') + masm_.computeEffectiveAddress(Address(current_character_, -'0'), temp0_); + masm_.branch32(Assembler::Above, temp0_, Imm32('9' - '0'), no_match); + return true; + case StandardCharacterSet::kNotDigit: + // Match anything except latin1 digits ('0'-'9') + masm_.computeEffectiveAddress(Address(current_character_, -'0'), temp0_); + masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32('9' - '0'), + no_match); + return true; + case StandardCharacterSet::kNotLineTerminator: + // Match non-newlines. This excludes '\n' (0x0a), '\r' (0x0d), + // U+2028 LINE SEPARATOR, and U+2029 PARAGRAPH SEPARATOR. + // See https://tc39.es/ecma262/#prod-LineTerminator + + // To test for 0x0a and 0x0d efficiently, we XOR the input with 1. + // This converts 0x0a to 0x0b, and 0x0d to 0x0c, allowing us to + // test for the contiguous range 0x0b..0x0c. + masm_.move32(current_character_, temp0_); + masm_.xor32(Imm32(0x01), temp0_); + masm_.sub32(Imm32(0x0b), temp0_); + masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32(0x0c - 0x0b), + no_match); + + if (mode_ == UC16) { + // Compare original value to 0x2028 and 0x2029, using the already + // computed (current_char ^ 0x01 - 0x0b). I.e., check for + // 0x201d (0x2028 - 0x0b) or 0x201e. + masm_.sub32(Imm32(0x2028 - 0x0b), temp0_); + masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32(0x2029 - 0x2028), + no_match); + } + return true; + case StandardCharacterSet::kWord: + // \w matches the set of 63 characters defined in Runtime Semantics: + // WordCharacters. We use a static lookup table, which is defined in + // regexp-macro-assembler.cc. + // Note: if both Unicode and IgnoreCase are true, \w matches a + // larger set of characters. That case is handled elsewhere. 
+ if (mode_ != LATIN1) { + masm_.branch32(Assembler::Above, current_character_, Imm32('z'), + no_match); + } + static_assert(arraysize(word_character_map) > unibrow::Latin1::kMaxChar); + masm_.movePtr(ImmPtr(word_character_map), temp0_); + masm_.load8ZeroExtend( + BaseIndex(temp0_, current_character_, js::jit::TimesOne), temp0_); + masm_.branchTest32(Assembler::Zero, temp0_, temp0_, no_match); + return true; + case StandardCharacterSet::kNotWord: { + // See 'w' above. + js::jit::Label done; + if (mode_ != LATIN1) { + masm_.branch32(Assembler::Above, current_character_, Imm32('z'), &done); + } + static_assert(arraysize(word_character_map) > unibrow::Latin1::kMaxChar); + masm_.movePtr(ImmPtr(word_character_map), temp0_); + masm_.load8ZeroExtend( + BaseIndex(temp0_, current_character_, js::jit::TimesOne), temp0_); + masm_.branchTest32(Assembler::NonZero, temp0_, temp0_, no_match); + if (mode_ != LATIN1) { + masm_.bind(&done); + } + return true; + } + //////////////////////////////////////////////////////////////////////// + // Non-standard classes (with no syntactic shorthand) used internally // + //////////////////////////////////////////////////////////////////////// + case StandardCharacterSet::kEverything: + // Match any character + return true; + case StandardCharacterSet::kLineTerminator: + // Match newlines. The opposite of '.'. See '.' above. + masm_.move32(current_character_, temp0_); + masm_.xor32(Imm32(0x01), temp0_); + masm_.sub32(Imm32(0x0b), temp0_); + if (mode_ == LATIN1) { + masm_.branch32(Assembler::Above, temp0_, Imm32(0x0c - 0x0b), no_match); + } else { + MOZ_ASSERT(mode_ == UC16); + js::jit::Label done; + masm_.branch32(Assembler::BelowOrEqual, temp0_, Imm32(0x0c - 0x0b), + &done); + + // Compare original value to 0x2028 and 0x2029, using the already + // computed (current_char ^ 0x01 - 0x0b). I.e., check for + // 0x201d (0x2028 - 0x0b) or 0x201e. 
+ masm_.sub32(Imm32(0x2028 - 0x0b), temp0_); + masm_.branch32(Assembler::Above, temp0_, Imm32(0x2029 - 0x2028), + no_match); + masm_.bind(&done); + } + return true; + } + return false; +} + +void SMRegExpMacroAssembler::Fail() { + masm_.movePtr(ImmWord(js::RegExpRunStatus_Success_NotFound), temp0_); + masm_.jump(&exit_label_); +} + +void SMRegExpMacroAssembler::GoTo(Label* to) { + masm_.jump(LabelOrBacktrack(to)); +} + +void SMRegExpMacroAssembler::IfRegisterGE(int reg, int comparand, + Label* if_ge) { + masm_.branchPtr(Assembler::GreaterThanOrEqual, register_location(reg), + ImmWord(comparand), LabelOrBacktrack(if_ge)); +} + +void SMRegExpMacroAssembler::IfRegisterLT(int reg, int comparand, + Label* if_lt) { + masm_.branchPtr(Assembler::LessThan, register_location(reg), + ImmWord(comparand), LabelOrBacktrack(if_lt)); +} + +void SMRegExpMacroAssembler::IfRegisterEqPos(int reg, Label* if_eq) { + masm_.branchPtr(Assembler::Equal, register_location(reg), current_position_, + LabelOrBacktrack(if_eq)); +} + +// This is a word-for-word identical copy of the V8 code, which is +// duplicated in at least nine different places in V8 (one per +// supported architecture) with no differences outside of comments and +// formatting. It should be hoisted into the superclass. Once that is +// done upstream, this version can be deleted. +void SMRegExpMacroAssembler::LoadCurrentCharacterImpl(int cp_offset, + Label* on_end_of_input, + bool check_bounds, + int characters, + int eats_at_least) { + // It's possible to preload a small number of characters when each success + // path requires a large number of characters, but not the reverse. + MOZ_ASSERT(eats_at_least >= characters); + MOZ_ASSERT(cp_offset < (1 << 30)); // Be sane! 
(And ensure negation works) + + if (check_bounds) { + if (cp_offset >= 0) { + CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input); + } else { + CheckPosition(cp_offset, on_end_of_input); + } + } + LoadCurrentCharacterUnchecked(cp_offset, characters); +} + +// Load the character (or characters) at the specified offset from the +// current position. Zero-extend to 32 bits. +void SMRegExpMacroAssembler::LoadCurrentCharacterUnchecked(int cp_offset, + int characters) { + BaseIndex address(input_end_pointer_, current_position_, js::jit::TimesOne, + cp_offset * char_size()); + if (mode_ == LATIN1) { + if (characters == 4) { + masm_.load32(address, current_character_); + } else if (characters == 2) { + masm_.load16ZeroExtend(address, current_character_); + } else { + MOZ_ASSERT(characters == 1); + masm_.load8ZeroExtend(address, current_character_); + } + } else { + MOZ_ASSERT(mode_ == UC16); + if (characters == 2) { + masm_.load32(address, current_character_); + } else { + MOZ_ASSERT(characters == 1); + masm_.load16ZeroExtend(address, current_character_); + } + } +} + +void SMRegExpMacroAssembler::PopCurrentPosition() { Pop(current_position_); } + +void SMRegExpMacroAssembler::PopRegister(int register_index) { + Pop(temp0_); + masm_.storePtr(temp0_, register_location(register_index)); +} + +void SMRegExpMacroAssembler::PushBacktrack(Label* label) { + MOZ_ASSERT(!label->is_bound()); + MOZ_ASSERT(!label->patchOffset_.bound()); + label->patchOffset_ = masm_.movWithPatch(ImmPtr(nullptr), temp0_); + MOZ_ASSERT(label->patchOffset_.bound()); + + Push(temp0_); + + CheckBacktrackStackLimit(); +} + +void SMRegExpMacroAssembler::PushCurrentPosition() { Push(current_position_); } + +void SMRegExpMacroAssembler::PushRegister(int register_index, + StackCheckFlag check_stack_limit) { + masm_.loadPtr(register_location(register_index), temp0_); + Push(temp0_); + if (check_stack_limit) { + CheckBacktrackStackLimit(); + } +} + +void 
SMRegExpMacroAssembler::ReadCurrentPositionFromRegister(int reg) { + masm_.loadPtr(register_location(reg), current_position_); +} + +void SMRegExpMacroAssembler::WriteCurrentPositionToRegister(int reg, + int cp_offset) { + if (cp_offset == 0) { + masm_.storePtr(current_position_, register_location(reg)); + } else { + Address addr(current_position_, cp_offset * char_size()); + masm_.computeEffectiveAddress(addr, temp0_); + masm_.storePtr(temp0_, register_location(reg)); + } +} + +// Note: The backtrack stack pointer is stored in a register as an +// offset from the stack top, not as a bare pointer, so that it is not +// corrupted if the backtrack stack grows (and therefore moves). +void SMRegExpMacroAssembler::ReadStackPointerFromRegister(int reg) { + masm_.loadPtr(register_location(reg), backtrack_stack_pointer_); + masm_.addPtr(backtrackStackBase(), backtrack_stack_pointer_); +} +void SMRegExpMacroAssembler::WriteStackPointerToRegister(int reg) { + masm_.movePtr(backtrack_stack_pointer_, temp0_); + masm_.subPtr(backtrackStackBase(), temp0_); + masm_.storePtr(temp0_, register_location(reg)); +} + +// When matching a regexp that is anchored at the end, this operation +// is used to try skipping the beginning of long strings. If the +// maximum length of a match is less than the length of the string, we +// can skip the initial len - max_len bytes. +void SMRegExpMacroAssembler::SetCurrentPositionFromEnd(int by) { + js::jit::Label after_position; + masm_.branchPtr(Assembler::GreaterThanOrEqual, current_position_, + ImmWord(-by * char_size()), &after_position); + masm_.movePtr(ImmWord(-by * char_size()), current_position_); + + // On RegExp code entry (where this operation is used), the character before + // the current position is expected to be already loaded. + // We have advanced the position, so it's safe to read backwards. 
+ LoadCurrentCharacterUnchecked(-1, 1); + masm_.bind(&after_position); +} + +void SMRegExpMacroAssembler::SetRegister(int register_index, int to) { + MOZ_ASSERT(register_index >= num_capture_registers_); + masm_.storePtr(ImmWord(to), register_location(register_index)); +} + +// Returns true if a regexp match can be restarted (aka the regexp is global). +// The return value is not used anywhere, but we implement it to be safe. +bool SMRegExpMacroAssembler::Succeed() { + masm_.jump(&success_label_); + return global(); +} + +// Capture registers are initialized to input[-1] +void SMRegExpMacroAssembler::ClearRegisters(int reg_from, int reg_to) { + MOZ_ASSERT(reg_from <= reg_to); + masm_.loadPtr(inputStart(), temp0_); + masm_.subPtr(Imm32(char_size()), temp0_); + for (int reg = reg_from; reg <= reg_to; reg++) { + masm_.storePtr(temp0_, register_location(reg)); + } +} + +void SMRegExpMacroAssembler::Push(Register source) { + MOZ_ASSERT(source != backtrack_stack_pointer_); + + masm_.subPtr(Imm32(sizeof(void*)), backtrack_stack_pointer_); + masm_.storePtr(source, Address(backtrack_stack_pointer_, 0)); +} + +void SMRegExpMacroAssembler::Pop(Register target) { + MOZ_ASSERT(target != backtrack_stack_pointer_); + + masm_.loadPtr(Address(backtrack_stack_pointer_, 0), target); + masm_.addPtr(Imm32(sizeof(void*)), backtrack_stack_pointer_); +} + +void SMRegExpMacroAssembler::JumpOrBacktrack(Label* to) { + if (to) { + masm_.jump(to->inner()); + } else { + Backtrack(); + } +} + +// Generate a quick inline test for backtrack stack overflow. +// If the test fails, call an OOL handler to try growing the stack. 
+void SMRegExpMacroAssembler::CheckBacktrackStackLimit() { + js::jit::Label no_stack_overflow; + masm_.branchPtr( + Assembler::BelowOrEqual, + AbsoluteAddress(isolate()->regexp_stack()->limit_address_address()), + backtrack_stack_pointer_, &no_stack_overflow); + + masm_.call(&stack_overflow_label_); + + // Exit with an exception if the call failed + masm_.branchTest32(Assembler::Zero, temp0_, temp0_, + &exit_with_exception_label_); + + masm_.bind(&no_stack_overflow); +} + +// This is used to sneak an OOM through the V8 layer. +static Handle<HeapObject> DummyCode() { + return Handle<HeapObject>::fromHandleValue(JS::UndefinedHandleValue); +} + +// Finalize code. This is called last, so that we know how many +// registers we need. +Handle<HeapObject> SMRegExpMacroAssembler::GetCode(Handle<String> source) { + if (!cx_->realm()->ensureJitRealmExists(cx_)) { + return DummyCode(); + } + + masm_.bind(&entry_label_); + + createStackFrame(); + initFrameAndRegs(); + + masm_.jump(&start_label_); + + successHandler(); + exitHandler(); + backtrackHandler(); + stackOverflowHandler(); + + Linker linker(masm_); + JitCode* code = linker.newCode(cx_, js::jit::CodeKind::RegExp); + if (!code) { + return DummyCode(); + } + + for (LabelPatch& lp : labelPatches_) { + Assembler::PatchDataWithValueCheck(CodeLocationLabel(code, lp.patchOffset_), + ImmPtr(code->raw() + lp.labelOffset_), + ImmPtr(nullptr)); + } + + CollectPerfSpewerJitCodeProfile(code, "RegExp"); + +#ifdef MOZ_VTUNE + js::vtune::MarkStub(code, "RegExp"); +#endif + + return Handle<HeapObject>(JS::PrivateGCThingValue(code), isolate()); +} + +/* + * The stack will have the following structure: + * sp-> - FrameData + * - inputStart + * - backtrack stack base + * - matches + * - numMatches + * - Registers + * - Capture positions + * - Scratch registers + * --- frame alignment --- + * - Saved register area + * fp-> - Frame pointer + * - Return address + */ +void SMRegExpMacroAssembler::createStackFrame() { +#ifdef JS_CODEGEN_ARM64 + 
// ARM64 communicates stack address via SP, but uses a pseudo-sp (PSP) for + // addressing. The register we use for PSP may however also be used by + // calling code, and it is nonvolatile, so save it. Do this as a special + // case first because the generic save/restore code needs the PSP to be + // initialized already. + MOZ_ASSERT(js::jit::PseudoStackPointer64.Is(masm_.GetStackPointer64())); + masm_.Str(js::jit::PseudoStackPointer64, + vixl::MemOperand(js::jit::sp, -16, vixl::PreIndex)); + + // Initialize the PSP from the SP. + masm_.initPseudoStackPtr(); +#endif + + masm_.Push(js::jit::FramePointer); + masm_.moveStackPtrTo(js::jit::FramePointer); + + // Push non-volatile registers which might be modified by jitcode. + for (GeneralRegisterForwardIterator iter(savedRegisters_); iter.more(); + ++iter) { + masm_.Push(*iter); + } + + // The pointer to InputOutputData is passed as the first argument. + // On x86 we have to load it off the stack into temp0_. + // On other platforms it is already in a register. +#ifdef JS_CODEGEN_X86 + Address ioDataAddr(js::jit::FramePointer, 2 * sizeof(void*)); + masm_.loadPtr(ioDataAddr, temp0_); +#else + if (js::jit::IntArgReg0 != temp0_) { + masm_.movePtr(js::jit::IntArgReg0, temp0_); + } +#endif + + // Start a new stack frame. + size_t frameBytes = sizeof(FrameData) + num_registers_ * sizeof(void*); + frameSize_ = js::jit::StackDecrementForCall(js::jit::ABIStackAlignment, + masm_.framePushed(), frameBytes); + masm_.reserveStack(frameSize_); + masm_.checkStackAlignment(); + + // Check if we have space on the stack. Use the *NoInterrupt stack limit to + // avoid failing repeatedly when the regex code is called from Ion JIT code. + // (See bug 1208819) + js::jit::Label stack_ok; + AbsoluteAddress limit_addr(cx_->addressOfJitStackLimitNoInterrupt()); + masm_.branchStackPtrRhs(Assembler::Below, limit_addr, &stack_ok); + + // There is not enough space on the stack. Exit with an exception. 
+ masm_.movePtr(ImmWord(js::RegExpRunStatus_Error), temp0_); + masm_.jump(&exit_label_); + + masm_.bind(&stack_ok); +} + +void SMRegExpMacroAssembler::initFrameAndRegs() { + // At this point, an uninitialized stack frame has been created, + // and the address of the InputOutputData is in temp0_. + Register ioDataReg = temp0_; + + Register matchesReg = temp1_; + masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, matches)), + matchesReg); + + // Initialize output registers + // Use |backtrack_stack_pointer_| as an additional temp register. This is safe + // because we haven't yet written any data to |backtrack_stack_pointer_|. + Register extraTemp = backtrack_stack_pointer_; + + masm_.loadPtr(Address(matchesReg, MatchPairs::offsetOfPairs()), extraTemp); + masm_.storePtr(extraTemp, matches()); + masm_.load32(Address(matchesReg, MatchPairs::offsetOfPairCount()), extraTemp); + masm_.store32(extraTemp, numMatches()); + +#ifdef DEBUG + // Bounds-check numMatches. + js::jit::Label enoughRegisters; + masm_.branchPtr(Assembler::GreaterThanOrEqual, extraTemp, + ImmWord(num_capture_registers_ / 2), &enoughRegisters); + masm_.assumeUnreachable("Not enough output pairs for RegExp"); + masm_.bind(&enoughRegisters); +#endif + + // Load input start pointer. + masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, inputStart)), + current_position_); + + // Load input end pointer + masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, inputEnd)), + input_end_pointer_); + + // Set up input position to be negative offset from string end. + masm_.subPtr(input_end_pointer_, current_position_); + + // Store inputStart + masm_.storePtr(current_position_, inputStart()); + + // Load start index + Register startIndexReg = temp1_; + masm_.loadPtr(Address(ioDataReg, offsetof(InputOutputData, startIndex)), + startIndexReg); + masm_.computeEffectiveAddress( + BaseIndex(current_position_, startIndexReg, factor()), current_position_); + + // Initialize current_character_. 
+ // Load newline if index is at start, or previous character otherwise. + js::jit::Label start_regexp; + js::jit::Label load_previous_character; + masm_.branchPtr(Assembler::NotEqual, startIndexReg, ImmWord(0), + &load_previous_character); + masm_.movePtr(ImmWord('\n'), current_character_); + masm_.jump(&start_regexp); + + masm_.bind(&load_previous_character); + LoadCurrentCharacterUnchecked(-1, 1); + masm_.bind(&start_regexp); + + // Initialize captured registers with inputStart - 1 + MOZ_ASSERT(num_capture_registers_ > 0); + Register inputStartMinusOneReg = temp0_; + masm_.loadPtr(inputStart(), inputStartMinusOneReg); + masm_.subPtr(Imm32(char_size()), inputStartMinusOneReg); + if (num_capture_registers_ > 8) { + masm_.movePtr(ImmWord(register_offset(0)), temp1_); + js::jit::Label init_loop; + masm_.bind(&init_loop); + masm_.storePtr(inputStartMinusOneReg, BaseIndex(masm_.getStackPointer(), + temp1_, js::jit::TimesOne)); + masm_.addPtr(ImmWord(sizeof(void*)), temp1_); + masm_.branchPtr(Assembler::LessThan, temp1_, + ImmWord(register_offset(num_capture_registers_)), + &init_loop); + } else { + // Unroll the loop + for (int i = 0; i < num_capture_registers_; i++) { + masm_.storePtr(inputStartMinusOneReg, register_location(i)); + } + } + + // Initialize backtrack stack pointer + masm_.loadPtr(AbsoluteAddress(ExternalReference::TopOfRegexpStack(isolate())), + backtrack_stack_pointer_); + masm_.storePtr(backtrack_stack_pointer_, backtrackStackBase()); +} + +// Called when we find a match. May not be generated if we can +// determine ahead of time that a regexp cannot match: for example, +// when compiling /\u1e9e/ for latin-1 inputs. +void SMRegExpMacroAssembler::successHandler() { + if (!success_label_.used()) { + return; + } + masm_.bind(&success_label_); + + // Copy captures to the MatchPairs pointed to by the InputOutputData. + // Captures are stored as positions, which are negative byte offsets + // from the end of the string. 
We must convert them to actual + // indices. + // + // Index: [ 0 ][ 1 ][ 2 ][ 3 ][ 4 ][ 5 ][END] + // Pos (1-byte): [-6 ][-5 ][-4 ][-3 ][-2 ][-1 ][ 0 ] // IS = -6 + // Pos (2-byte): [-12][-10][-8 ][-6 ][-4 ][-2 ][ 0 ] // IS = -12 + // + // To convert a position to an index, we subtract InputStart, and + // divide the result by char_size. + Register matchesReg = temp1_; + masm_.loadPtr(matches(), matchesReg); + + // Use |backtrack_stack_pointer_| as an additional temp register. This is safe + // because we don't read from |backtrack_stack_pointer_| after this point. + Register extraTemp = backtrack_stack_pointer_; + + Register inputStartReg = extraTemp; + masm_.loadPtr(inputStart(), inputStartReg); + + for (int i = 0; i < num_capture_registers_; i++) { + masm_.loadPtr(register_location(i), temp0_); + masm_.subPtr(inputStartReg, temp0_); + if (mode_ == UC16) { + masm_.rshiftPtrArithmetic(Imm32(1), temp0_); + } + masm_.store32(temp0_, Address(matchesReg, i * sizeof(int32_t))); + } + + masm_.movePtr(ImmWord(js::RegExpRunStatus_Success), temp0_); + // This falls through to the exit handler. +} + +void SMRegExpMacroAssembler::exitHandler() { + masm_.bind(&exit_label_); + + if (temp0_ != js::jit::ReturnReg) { + masm_.movePtr(temp0_, js::jit::ReturnReg); + } + + masm_.freeStack(frameSize_); + + // Restore registers which were saved on entry + for (GeneralRegisterBackwardIterator iter(savedRegisters_); iter.more(); + ++iter) { + masm_.Pop(*iter); + } + + masm_.Pop(js::jit::FramePointer); + +#ifdef JS_CODEGEN_ARM64 + // Now restore the value that was in the PSP register on entry, and return. + + // Obtain the correct SP from the PSP. + masm_.Mov(js::jit::sp, js::jit::PseudoStackPointer64); + + // Restore the saved value of the PSP register, this value is whatever the + // caller had saved in it, not any actual SP value, and it must not be + // overwritten subsequently. 
+ masm_.Ldr(js::jit::PseudoStackPointer64, + vixl::MemOperand(js::jit::sp, 16, vixl::PostIndex)); + + // Perform a plain Ret(), as abiret() will move SP <- PSP and that is wrong. + masm_.Ret(vixl::lr); +#else + masm_.abiret(); +#endif + + if (exit_with_exception_label_.used()) { + masm_.bind(&exit_with_exception_label_); + + // Exit with an error result to signal thrown exception + masm_.movePtr(ImmWord(js::RegExpRunStatus_Error), temp0_); + masm_.jump(&exit_label_); + } +} + +void SMRegExpMacroAssembler::backtrackHandler() { + if (!backtrack_label_.used()) { + return; + } + masm_.bind(&backtrack_label_); + Backtrack(); +} + +void SMRegExpMacroAssembler::stackOverflowHandler() { + if (!stack_overflow_label_.used()) { + return; + } + + js::jit::AutoCreatedBy acb(masm_, + "SMRegExpMacroAssembler::stackOverflowHandler"); + + // Called if the backtrack-stack limit has been hit. + masm_.bind(&stack_overflow_label_); + + // Load argument + masm_.movePtr(ImmPtr(isolate()->regexp_stack()), temp1_); + + // Save registers before calling C function + LiveGeneralRegisterSet volatileRegs(GeneralRegisterSet::Volatile()); + +#ifdef JS_USE_LINK_REGISTER + masm_.pushReturnAddress(); +#endif + + // Adjust for the return address on the stack. + size_t frameOffset = sizeof(void*); + + volatileRegs.takeUnchecked(temp0_); + volatileRegs.takeUnchecked(temp1_); + masm_.PushRegsInMask(volatileRegs); + + using Fn = bool (*)(RegExpStack* regexp_stack); + masm_.setupUnalignedABICall(temp0_); + masm_.passABIArg(temp1_); + masm_.callWithABI<Fn, ::js::irregexp::GrowBacktrackStack>(); + masm_.storeCallBoolResult(temp0_); + + masm_.PopRegsInMask(volatileRegs); + + // If GrowBacktrackStack returned false, we have failed to grow the + // stack, and must exit with a stack-overflow exception. Do this in + // the caller so that the stack is adjusted by our return instruction. 
+ js::jit::Label overflow_return; + masm_.branchTest32(Assembler::Zero, temp0_, temp0_, &overflow_return); + + // Otherwise, store the new backtrack stack base and recompute the new + // top of the stack. + Address bsbAddress(masm_.getStackPointer(), + offsetof(FrameData, backtrackStackBase) + frameOffset); + masm_.subPtr(bsbAddress, backtrack_stack_pointer_); + + masm_.loadPtr(AbsoluteAddress(ExternalReference::TopOfRegexpStack(isolate())), + temp1_); + masm_.storePtr(temp1_, bsbAddress); + masm_.addPtr(temp1_, backtrack_stack_pointer_); + + // Resume execution in calling code. + masm_.bind(&overflow_return); + masm_.ret(); +} + +// This is only used by tracing code. +// The return value doesn't matter. +RegExpMacroAssembler::IrregexpImplementation +SMRegExpMacroAssembler::Implementation() { + return kBytecodeImplementation; +} + +// Compare two strings in `/i` mode (ignoreCase, but not unicode). +/*static */ +uint32_t SMRegExpMacroAssembler::CaseInsensitiveCompareNonUnicode( + const char16_t* substring1, const char16_t* substring2, size_t byteLength) { + js::AutoUnsafeCallWithABI unsafe; + + MOZ_ASSERT(byteLength % sizeof(char16_t) == 0); + size_t length = byteLength / sizeof(char16_t); + + for (size_t i = 0; i < length; i++) { + char16_t c1 = substring1[i]; + char16_t c2 = substring2[i]; + if (c1 != c2) { +#ifdef JS_HAS_INTL_API + // Non-unicode regexps have weird case-folding rules. + c1 = RegExpCaseFolding::Canonicalize(c1); + c2 = RegExpCaseFolding::Canonicalize(c2); +#else + // If we aren't building with ICU, fall back to `/iu` mode. The only + // differences are in corner cases. + c1 = js::unicode::FoldCase(c1); + c2 = js::unicode::FoldCase(c2); +#endif + if (c1 != c2) { + return 0; + } + } + } + + return 1; +} + +// Compare two strings in `/iu` mode (ignoreCase and unicode). 
+/*static */ +uint32_t SMRegExpMacroAssembler::CaseInsensitiveCompareUnicode( + const char16_t* substring1, const char16_t* substring2, size_t byteLength) { + js::AutoUnsafeCallWithABI unsafe; + + MOZ_ASSERT(byteLength % sizeof(char16_t) == 0); + size_t length = byteLength / sizeof(char16_t); + + for (size_t i = 0; i < length; i++) { + char16_t c1 = substring1[i]; + char16_t c2 = substring2[i]; + if (c1 != c2) { + // Unicode regexps use the common and simple case-folding + // mappings of the Unicode Character Database. + c1 = js::unicode::FoldCase(c1); + c2 = js::unicode::FoldCase(c2); + if (c1 != c2) { + return 0; + } + } + } + + return 1; +} + +/* static */ +bool SMRegExpMacroAssembler::GrowBacktrackStack(RegExpStack* regexp_stack) { + js::AutoUnsafeCallWithABI unsafe; + size_t size = regexp_stack->memory_size(); + return !!regexp_stack->EnsureCapacity(size * 2); +} + +bool SMRegExpMacroAssembler::CanReadUnaligned() const { +#if defined(JS_CODEGEN_ARM) + return !js::jit::HasAlignmentFault(); +#elif defined(JS_CODEGEN_MIPS32) || defined(JS_CODEGEN_MIPS64) + return false; +#else + return true; +#endif +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/RegExpNativeMacroAssembler.h b/js/src/irregexp/RegExpNativeMacroAssembler.h new file mode 100644 index 0000000000..3c3acf40d4 --- /dev/null +++ b/js/src/irregexp/RegExpNativeMacroAssembler.h @@ -0,0 +1,308 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This file implements the NativeRegExpMacroAssembler interface for +// SpiderMonkey. 
It provides the same interface as each of V8's +// architecture-specific implementations. + +#ifndef RegexpMacroAssemblerArch_h +#define RegexpMacroAssemblerArch_h + +#include "irregexp/imported/regexp-macro-assembler.h" +#include "jit/MacroAssembler.h" + +namespace v8 { +namespace internal { + +struct FrameData { + // Character position at the start of the input, stored as a + // negative offset from the end of the string (input_end_pointer_). + size_t inputStart; + + // The backtrack_stack_pointer_ register points to the top of the stack. + // This points to the bottom of the backtrack stack. + void* backtrackStackBase; + + // Copy of the input MatchPairs. + int32_t* matches; // pointer to capture array + int32_t numMatches; // size of capture array +}; + +class SMRegExpMacroAssembler final : public NativeRegExpMacroAssembler { + public: + SMRegExpMacroAssembler(JSContext* cx, js::jit::StackMacroAssembler& masm, + Zone* zone, Mode mode, uint32_t num_capture_registers); + virtual ~SMRegExpMacroAssembler() = default; + + virtual int stack_limit_slack(); + virtual IrregexpImplementation Implementation(); + + virtual bool Succeed(); + virtual void Fail(); + + virtual void AdvanceCurrentPosition(int by); + virtual void PopCurrentPosition(); + virtual void PushCurrentPosition(); + virtual void SetCurrentPositionFromEnd(int by); + + virtual void Backtrack(); + virtual void Bind(Label* label); + virtual void GoTo(Label* label); + virtual void PushBacktrack(Label* label); + + virtual void CheckCharacter(uint32_t c, Label* on_equal); + virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); + virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater); + virtual void CheckCharacterLT(base::uc16 limit, Label* on_less); + virtual void CheckCharacterAfterAnd(uint32_t c, uint32_t mask, + Label* on_equal); + virtual void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask, + Label* on_not_equal); + virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, 
base::uc16 minus, + base::uc16 mask, + Label* on_not_equal); + virtual void CheckGreedyLoop(Label* on_tos_equals_current_position); + virtual void CheckCharacterInRange(base::uc16 from, base::uc16 to, + Label* on_in_range); + virtual void CheckCharacterNotInRange(base::uc16 from, base::uc16 to, + Label* on_not_in_range); + virtual bool CheckCharacterInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_in_range); + virtual bool CheckCharacterNotInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_not_in_range); + virtual void CheckAtStart(int cp_offset, Label* on_at_start); + virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start); + virtual void CheckPosition(int cp_offset, Label* on_outside_input); + virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set); + virtual bool CheckSpecialCharacterClass(StandardCharacterSet type, + Label* on_no_match); + virtual void CheckNotBackReference(int start_reg, bool read_backward, + Label* on_no_match); + virtual void CheckNotBackReferenceIgnoreCase(int start_reg, + bool read_backward, bool unicode, + Label* on_no_match); + + virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, + bool check_bounds, int characters, + int eats_at_least); + + virtual void AdvanceRegister(int reg, int by); + virtual void IfRegisterGE(int reg, int comparand, Label* if_ge); + virtual void IfRegisterLT(int reg, int comparand, Label* if_lt); + virtual void IfRegisterEqPos(int reg, Label* if_eq); + virtual void PopRegister(int register_index); + virtual void PushRegister(int register_index, + StackCheckFlag check_stack_limit); + virtual void ReadCurrentPositionFromRegister(int reg); + virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); + virtual void ReadStackPointerFromRegister(int reg); + virtual void WriteStackPointerToRegister(int reg); + virtual void SetRegister(int register_index, int to); + virtual void ClearRegisters(int reg_from, int reg_to); + + 
virtual Handle<HeapObject> GetCode(Handle<String> source); + + virtual bool CanReadUnaligned() const; + + private: + size_t frameSize_ = 0; + + void createStackFrame(); + void initFrameAndRegs(); + void successHandler(); + void exitHandler(); + void backtrackHandler(); + void stackOverflowHandler(); + + // Push a register on the backtrack stack. + void Push(js::jit::Register value); + + // Pop a value from the backtrack stack. + void Pop(js::jit::Register target); + + void CheckAtStartImpl(int cp_offset, Label* on_cond, + js::jit::Assembler::Condition cond); + void CheckCharacterImpl(js::jit::Imm32 c, Label* on_cond, + js::jit::Assembler::Condition cond); + void CheckCharacterAfterAndImpl(uint32_t c, uint32_t and_with, Label* on_cond, + bool negate); + void CheckCharacterInRangeImpl(base::uc16 from, base::uc16 to, Label* on_cond, + js::jit::Assembler::Condition cond); + void CheckNotBackReferenceImpl(int start_reg, bool read_backward, + bool unicode, Label* on_no_match, + bool ignore_case); + void CallIsCharacterInRangeArray(const ZoneList<CharacterRange>* ranges); + + void LoadCurrentCharacterUnchecked(int cp_offset, int characters); + + void JumpOrBacktrack(Label* to); + + // MacroAssembler methods that take a Label can be called with a + // null label, which means that we should backtrack if we would jump + // to that label. This is a helper to avoid writing out the same + // logic a dozen times. + inline js::jit::Label* LabelOrBacktrack(Label* to) { + return to ? 
to->inner() : &backtrack_label_; + } + + void CheckBacktrackStackLimit(); + + public: + static bool GrowBacktrackStack(RegExpStack* regexp_stack); + + static uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength); + static uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength); + static bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges); + + private: + inline int char_size() { return static_cast<int>(mode_); } + inline js::jit::Scale factor() { + return mode_ == UC16 ? js::jit::TimesTwo : js::jit::TimesOne; + } + + js::jit::Address inputStart() { + return js::jit::Address(masm_.getStackPointer(), + offsetof(FrameData, inputStart)); + } + js::jit::Address backtrackStackBase() { + return js::jit::Address(masm_.getStackPointer(), + offsetof(FrameData, backtrackStackBase)); + } + js::jit::Address matches() { + return js::jit::Address(masm_.getStackPointer(), + offsetof(FrameData, matches)); + } + js::jit::Address numMatches() { + return js::jit::Address(masm_.getStackPointer(), + offsetof(FrameData, numMatches)); + } + + // The stack-pointer-relative location of a regexp register. + js::jit::Address register_location(int register_index) { + return js::jit::Address(masm_.getStackPointer(), + register_offset(register_index)); + } + + int32_t register_offset(int register_index) { + MOZ_ASSERT(register_index >= 0 && register_index <= kMaxRegister); + if (num_registers_ <= register_index) { + num_registers_ = register_index + 1; + } + static_assert(alignof(uintptr_t) <= alignof(FrameData)); + return sizeof(FrameData) + register_index * sizeof(uintptr_t*); + } + + JSContext* cx_; + js::jit::StackMacroAssembler& masm_; + + /* + * This assembler uses the following registers: + * + * - current_character_: + * Contains the character (or characters) currently being examined. 
+ * Must be loaded using LoadCurrentCharacter before using any of the + * dispatch methods. After a matching pass for a global regexp, + * temporarily stores the index of capture start. + * - current_position_: + * Current position in input *as negative byte offset from end of string*. + * - input_end_pointer_: + * Points to byte after last character in the input. current_position_ is + * relative to this. + * - backtrack_stack_pointer_: + * Points to tip of the (heap-allocated) backtrack stack. The stack grows + * downward (like the native stack). + * - temp0_, temp1_, temp2_: + * Scratch registers. + * + * The native stack pointer is used to access arguments (InputOutputData), + * local variables (FrameData), and irregexp's internal virtual registers + * (see register_location). + */ + + js::jit::Register current_character_; + js::jit::Register current_position_; + js::jit::Register input_end_pointer_; + js::jit::Register backtrack_stack_pointer_; + js::jit::Register temp0_, temp1_, temp2_; + + // These labels are used in various API calls and bound (if used) in + // GetCode. If we abort in the middle of a compilation, as may + // happen if a regexp is too big, they may be used but not + // bound. + js::jit::NonAssertingLabel entry_label_; + js::jit::NonAssertingLabel start_label_; + js::jit::NonAssertingLabel backtrack_label_; + js::jit::NonAssertingLabel success_label_; + js::jit::NonAssertingLabel exit_label_; + js::jit::NonAssertingLabel stack_overflow_label_; + js::jit::NonAssertingLabel exit_with_exception_label_; + + // When we generate the code to push a backtrack label's address + // onto the backtrack stack, we don't know its final address. We + // have to patch it after linking. This is slightly delicate, as the + // Label itself (which is allocated on the stack) may not exist by + // the time we link. The approach is as follows: + // + // 1. 
When we push a label on the backtrack stack (PushBacktrack), + // we bind the label's patchOffset_ field to the offset within + // the code that should be overwritten. This works because each + // label is only pushed by a single instruction. + // + // 2. When we bind a label (Bind), we check to see if it has a + // bound patchOffset_. If it does, we create a LabelPatch mapping + // its patch offset to the offset of the label itself. + // + // 3. While linking the code, we walk the list of label patches + // and patch the code accordingly. + class LabelPatch { + public: + LabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset) + : patchOffset_(patchOffset), labelOffset_(labelOffset) {} + + js::jit::CodeOffset patchOffset_; + size_t labelOffset_ = 0; + }; + + js::Vector<LabelPatch, 4, js::SystemAllocPolicy> labelPatches_; + void AddLabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + if (!labelPatches_.emplaceBack(patchOffset, labelOffset)) { + oomUnsafe.crash("Irregexp label patch"); + } + } + + Mode mode_; + int num_registers_; + int num_capture_registers_; + js::jit::LiveGeneralRegisterSet savedRegisters_; + + public: + using TableVector = + js::Vector<PseudoHandle<ByteArrayData>, 4, js::SystemAllocPolicy>; + TableVector& tables() { return tables_; } + + private: + TableVector tables_; + void AddTable(PseudoHandle<ByteArrayData> table) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + if (!tables_.append(std::move(table))) { + oomUnsafe.crash("Irregexp table append"); + } + } +}; + +} // namespace internal +} // namespace v8 + +#endif // RegexpMacroAssemblerArch_h diff --git a/js/src/irregexp/RegExpShim.cpp b/js/src/irregexp/RegExpShim.cpp new file mode 100644 index 0000000000..2b2c3cd4a0 --- /dev/null +++ b/js/src/irregexp/RegExpShim.cpp @@ -0,0 +1,297 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is 
subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/RegExpShim.h" + +#include "mozilla/MemoryReporting.h" + +#include <iostream> + +#include "irregexp/imported/regexp-macro-assembler.h" +#include "irregexp/imported/regexp-stack.h" + +#include "vm/NativeObject-inl.h" + +namespace v8 { +namespace internal { + +void PrintF(const char* format, ...) { + va_list arguments; + va_start(arguments, format); + vprintf(format, arguments); + va_end(arguments); +} + +void PrintF(FILE* out, const char* format, ...) { + va_list arguments; + va_start(arguments, format); + vfprintf(out, format, arguments); + va_end(arguments); +} + +StdoutStream::operator std::ostream&() const { return std::cerr; } + +template <typename T> +std::ostream& StdoutStream::operator<<(T t) { + return std::cerr << t; +} + +template std::ostream& StdoutStream::operator<<(char const* c); + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/ostreams.cc#L120-L169 +// (This is a hand-simplified version.) +// Writes the given character to the output escaping everything outside +// of printable ASCII range. +std::ostream& operator<<(std::ostream& os, const AsUC16& c) { + base::uc16 v = c.value; + bool isPrint = 0x20 < v && v <= 0x7e; + char buf[10]; + const char* format = isPrint ? "%c" : (v <= 0xFF) ? 
"\\x%02x" : "\\u%04x"; + SprintfLiteral(buf, format, v); + return os << buf; +} +std::ostream& operator<<(std::ostream& os, const AsUC32& c) { + int32_t v = c.value; + if (v <= String::kMaxUtf16CodeUnit) { + return os << AsUC16(v); + } + char buf[13]; + SprintfLiteral(buf, "\\u{%06x}", v); + return os << buf; +} + +HandleScope::HandleScope(Isolate* isolate) : isolate_(isolate) { + isolate->openHandleScope(*this); +} + +HandleScope::~HandleScope() { + isolate_->closeHandleScope(level_, non_gc_level_); +} + +template <typename T> +Handle<T>::Handle(T object, Isolate* isolate) + : location_(isolate->getHandleLocation(object.value())) {} + +template Handle<ByteArray>::Handle(ByteArray b, Isolate* isolate); +template Handle<HeapObject>::Handle(const JS::Value& v, Isolate* isolate); +template Handle<JSRegExp>::Handle(JSRegExp re, Isolate* isolate); +template Handle<String>::Handle(String s, Isolate* isolate); + +template <typename T> +Handle<T>::Handle(const JS::Value& value, Isolate* isolate) + : location_(isolate->getHandleLocation(value)) { + T::cast(Object(value)); // Assert that value has the correct type. 
+} + +JS::Value* Isolate::getHandleLocation(const JS::Value& value) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + if (!handleArena_.Append(value)) { + oomUnsafe.crash("Irregexp handle allocation"); + } + return &handleArena_.GetLast(); +} + +void* Isolate::allocatePseudoHandle(size_t bytes) { + PseudoHandle<void> ptr; + ptr.reset(js_malloc(bytes)); + if (!ptr) { + return nullptr; + } + if (!uniquePtrArena_.Append(std::move(ptr))) { + return nullptr; + } + return uniquePtrArena_.GetLast().get(); +} + +template <typename T> +PseudoHandle<T> Isolate::takeOwnership(void* ptr) { + PseudoHandle<T> result = maybeTakeOwnership<T>(ptr); + MOZ_ASSERT(result); + return result; +} + +template <typename T> +PseudoHandle<T> Isolate::maybeTakeOwnership(void* ptr) { + for (auto iter = uniquePtrArena_.IterFromLast(); !iter.Done(); iter.Prev()) { + auto& entry = iter.Get(); + if (entry.get() == ptr) { + PseudoHandle<T> result; + result.reset(static_cast<T*>(entry.release())); + return result; + } + } + return PseudoHandle<T>(); +} + +PseudoHandle<ByteArrayData> ByteArray::maybeTakeOwnership(Isolate* isolate) { + PseudoHandle<ByteArrayData> result = + isolate->maybeTakeOwnership<ByteArrayData>(value().toPrivate()); + setValue(JS::PrivateValue(nullptr)); + return result; +} + +PseudoHandle<ByteArrayData> ByteArray::takeOwnership(Isolate* isolate) { + PseudoHandle<ByteArrayData> result = maybeTakeOwnership(isolate); + MOZ_ASSERT(result); + return result; +} + +void Isolate::trace(JSTracer* trc) { + js::gc::AssertRootMarkingPhase(trc); + + for (auto iter = handleArena_.Iter(); !iter.Done(); iter.Next()) { + auto& elem = iter.Get(); + JS::GCPolicy<JS::Value>::trace(trc, &elem, "Isolate handle arena"); + } +} + +size_t Isolate::sizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf) const { + size_t size = mallocSizeOf(this); + + size += mallocSizeOf(regexpStack_); + size += ExternalReference::SizeOfExcludingThis(mallocSizeOf, regexpStack_); + + size += 
handleArena_.SizeOfExcludingThis(mallocSizeOf); + size += uniquePtrArena_.SizeOfExcludingThis(mallocSizeOf); + return size; +} + +/*static*/ Handle<String> String::Flatten(Isolate* isolate, + Handle<String> string) { + if (string->IsFlat()) { + return string; + } + js::AutoEnterOOMUnsafeRegion oomUnsafe; + JSLinearString* linear = string->str()->ensureLinear(isolate->cx()); + if (!linear) { + oomUnsafe.crash("Irregexp String::Flatten"); + } + return Handle<String>(JS::StringValue(linear), isolate); +} + +// This is only used for trace messages printing the source pattern of +// a regular expression. We have to return a unique_ptr, but we don't +// care about the contents, so we return an empty null-terminated string. +std::unique_ptr<char[]> String::ToCString() { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + + std::unique_ptr<char[]> ptr; + ptr.reset(static_cast<char*>(js_malloc(1))); + if (!ptr) { + oomUnsafe.crash("Irregexp String::ToCString"); + } + ptr[0] = '\0'; + + return ptr; +} + +bool Isolate::init() { + regexpStack_ = js_new<RegExpStack>(); + if (!regexpStack_) { + return false; + } + return true; +} + +Isolate::~Isolate() { + if (regexpStack_) { + js_delete(regexpStack_); + } +} + +/* static */ +const void* ExternalReference::TopOfRegexpStack(Isolate* isolate) { + return reinterpret_cast<const void*>( + isolate->regexp_stack()->memory_top_address_address()); +} + +/* static */ +size_t ExternalReference::SizeOfExcludingThis( + mozilla::MallocSizeOf mallocSizeOf, RegExpStack* regexpStack) { + if (regexpStack->thread_local_.owns_memory_) { + return mallocSizeOf(regexpStack->thread_local_.memory_); + } + return 0; +} + +Handle<ByteArray> Isolate::NewByteArray(int length, AllocationType alloc) { + MOZ_RELEASE_ASSERT(length >= 0); + + js::AutoEnterOOMUnsafeRegion oomUnsafe; + + size_t alloc_size = sizeof(uint32_t) + length; + ByteArrayData* data = + static_cast<ByteArrayData*>(allocatePseudoHandle(alloc_size)); + if (!data) { + oomUnsafe.crash("Irregexp 
NewByteArray"); + } + data->length = length; + + return Handle<ByteArray>(JS::PrivateValue(data), this); +} + +Handle<FixedArray> Isolate::NewFixedArray(int length) { + MOZ_RELEASE_ASSERT(length >= 0); + js::AutoEnterOOMUnsafeRegion oomUnsafe; + js::ArrayObject* array = js::NewDenseFullyAllocatedArray(cx(), length); + if (!array) { + oomUnsafe.crash("Irregexp NewFixedArray"); + } + array->ensureDenseInitializedLength(0, length); + return Handle<FixedArray>(JS::ObjectValue(*array), this); +} + +template <typename T> +Handle<FixedIntegerArray<T>> Isolate::NewFixedIntegerArray(uint32_t length) { + MOZ_RELEASE_ASSERT(length < std::numeric_limits<uint32_t>::max() / sizeof(T)); + js::AutoEnterOOMUnsafeRegion oomUnsafe; + + uint32_t rawLength = length * sizeof(T); + size_t allocSize = sizeof(ByteArrayData) + rawLength; + ByteArrayData* data = + static_cast<ByteArrayData*>(allocatePseudoHandle(allocSize)); + if (!data) { + oomUnsafe.crash("Irregexp NewFixedIntegerArray"); + } + data->length = rawLength; + + return Handle<FixedIntegerArray<T>>(JS::PrivateValue(data), this); +} + +template <typename T> +Handle<FixedIntegerArray<T>> FixedIntegerArray<T>::New(Isolate* isolate, + uint32_t length) { + return isolate->NewFixedIntegerArray<T>(length); +} + +template class FixedIntegerArray<uint16_t>; + +template <typename CharT> +Handle<String> Isolate::InternalizeString( + const base::Vector<const CharT>& str) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + JSAtom* atom = js::AtomizeChars(cx(), str.begin(), str.length()); + if (!atom) { + oomUnsafe.crash("Irregexp InternalizeString"); + } + return Handle<String>(JS::StringValue(atom), this); +} + +template Handle<String> Isolate::InternalizeString( + const base::Vector<const uint8_t>& str); +template Handle<String> Isolate::InternalizeString( + const base::Vector<const char16_t>& str); + +static_assert(JSRegExp::RegistersForCaptureCount(JSRegExp::kMaxCaptures) <= + RegExpMacroAssembler::kMaxRegisterCount); + +} // namespace 
internal +} // namespace v8 diff --git a/js/src/irregexp/RegExpShim.h b/js/src/irregexp/RegExpShim.h new file mode 100644 index 0000000000..e503ffb096 --- /dev/null +++ b/js/src/irregexp/RegExpShim.h @@ -0,0 +1,1283 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef RegexpShim_h +#define RegexpShim_h + +#include "mozilla/Assertions.h" +#include "mozilla/Attributes.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/Maybe.h" +#include "mozilla/SegmentedVector.h" +#include "mozilla/Sprintf.h" +#include "mozilla/Types.h" + +#include <algorithm> +#include <cctype> +#include <iterator> + +#include "irregexp/RegExpTypes.h" +#include "irregexp/util/FlagsShim.h" +#include "irregexp/util/VectorShim.h" +#include "irregexp/util/ZoneShim.h" +#include "jit/JitCode.h" +#include "jit/Label.h" +#include "jit/shared/Assembler-shared.h" +#include "js/friend/StackLimits.h" // js::AutoCheckRecursionLimit +#include "js/RegExpFlags.h" +#include "js/Value.h" +#include "threading/ExclusiveData.h" +#include "util/DifferentialTesting.h" +#include "vm/JSContext.h" +#include "vm/MutexIDs.h" +#include "vm/NativeObject.h" +#include "vm/RegExpShared.h" + +// Forward declaration of classes +namespace v8 { +namespace internal { + +class Heap; +class Isolate; +class RegExpMatchInfo; +class RegExpStack; + +template <typename T> +class Handle; + +} // namespace internal +} // namespace v8 + +#define V8_WARN_UNUSED_RESULT [[nodiscard]] +#define V8_EXPORT_PRIVATE +#define V8_FALLTHROUGH [[fallthrough]] +#define V8_NODISCARD [[nodiscard]] 
#define V8_NOEXCEPT noexcept

// Fatal-error macros used by the imported V8 sources, mapped onto MOZ_CRASH.
#define FATAL(x) MOZ_CRASH(x)
#define UNREACHABLE() MOZ_CRASH("unreachable code")
#define UNIMPLEMENTED() MOZ_CRASH("unimplemented code")
// The stringified expression doubles as the static_assert message.
#define STATIC_ASSERT(exp) static_assert(exp, #exp)

// V8's DCHECK family maps onto MOZ_ASSERT and its CHECK family onto
// MOZ_RELEASE_ASSERT. Operands are parenthesized before being compared.
#define DCHECK MOZ_ASSERT
#define DCHECK_EQ(lhs, rhs) MOZ_ASSERT((lhs) == (rhs))
#define DCHECK_NE(lhs, rhs) MOZ_ASSERT((lhs) != (rhs))
#define DCHECK_GT(lhs, rhs) MOZ_ASSERT((lhs) > (rhs))
#define DCHECK_GE(lhs, rhs) MOZ_ASSERT((lhs) >= (rhs))
#define DCHECK_LT(lhs, rhs) MOZ_ASSERT((lhs) < (rhs))
#define DCHECK_LE(lhs, rhs) MOZ_ASSERT((lhs) <= (rhs))
#define DCHECK_NULL(val) MOZ_ASSERT((val) == nullptr)
#define DCHECK_NOT_NULL(val) MOZ_ASSERT((val) != nullptr)
#define DCHECK_IMPLIES(lhs, rhs) MOZ_ASSERT_IF(lhs, rhs)
#define CHECK MOZ_RELEASE_ASSERT
#define CHECK_EQ(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) == (rhs))
#define CHECK_LE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) <= (rhs))
#define CHECK_GE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) >= (rhs))
#define CONSTEXPR_DCHECK MOZ_ASSERT

#define MemCopy memcpy

// Origin:
// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L310-L319
// ptrdiff_t is 't' according to the standard, but MSVC uses 'I'.
#ifdef _MSC_VER
#  define V8PRIxPTRDIFF "Ix"
#  define V8PRIdPTRDIFF "Id"
#  define V8PRIuPTRDIFF "Iu"
#else
#  define V8PRIxPTRDIFF "tx"
#  define V8PRIdPTRDIFF "td"
#  define V8PRIuPTRDIFF "tu"
#endif

// V8's array-length helper is spelled with the standard std::size.
#define arraysize std::size

// Explicitly declare the assignment operator as deleted.
#define DISALLOW_ASSIGN(TypeName) TypeName& operator=(const TypeName&) = delete

// Explicitly declare the copy constructor and assignment operator as deleted.
// This also deletes the implicit move constructor and implicit move assignment
// operator, but still allows to manually define them.
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + DISALLOW_ASSIGN(TypeName) + +// Explicitly declare all implicit constructors as deleted, namely the +// default constructor, copy constructor and operator= functions. +// This is especially useful for classes containing only static methods. +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName() = delete; \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +namespace v8 { + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L364-L367 +template <typename T, typename U> +constexpr inline bool IsAligned(T value, U alignment) { + return (value & (alignment - 1)) == 0; +} + +using byte = uint8_t; +using Address = uintptr_t; +static const Address kNullAddress = 0; + +inline uintptr_t GetCurrentStackPosition() { + return reinterpret_cast<uintptr_t>(__builtin_frame_address(0)); +} + +namespace base { + +// Latin1/UTF-16 constants +// Code-point values in Unicode 4.0 are 21 bits wide. +// Code units in UTF-16 are 16 bits wide. +using uc16 = char16_t; +using uc32 = uint32_t; + +constexpr int kUC16Size = sizeof(base::uc16); + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L247-L258 +// The USE(x, ...) template is used to silence C++ compiler warnings +// issued for (yet) unused variables (typically parameters). +// The arguments are guaranteed to be evaluated from left to right. +struct Use { + template <typename T> + Use(T&&) {} // NOLINT(runtime/explicit) +}; +#define USE(...) 
\ + do { \ + ::v8::base::Use unused_tmp_array_for_use_macro[]{__VA_ARGS__}; \ + (void)unused_tmp_array_for_use_macro; \ + } while (false) + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/safe_conversions.h#L35-L39 +// saturated_cast<> is analogous to static_cast<> for numeric types, except +// that the specified numeric conversion will saturate rather than overflow or +// underflow. +template <typename Dst, typename Src> +inline Dst saturated_cast(Src value); + +// This is the only specialization that is needed for regexp code. +// Instead of pulling in dozens of lines of template goo +// to derive it, I used the implementation from uint8_clamped in +// ArrayBufferObject.h. +template <> +inline uint8_t saturated_cast<uint8_t, int>(int x) { + return (x >= 0) ? ((x < 255) ? uint8_t(x) : 255) : 0; +} + +// Origin: +// https://github.com/v8/v8/blob/fc088cdaccadede84886eee881e67af9db53669a/src/base/bounds.h#L14-L28 +// Checks if value is in range [lower_limit, higher_limit] using a single +// branch. +template <typename T, typename U> +inline constexpr bool IsInRange(T value, U lower_limit, U higher_limit) { + using unsigned_T = typename std::make_unsigned<T>::type; + // Use static_cast to support enum classes. 
+ return static_cast<unsigned_T>(static_cast<unsigned_T>(value) - + static_cast<unsigned_T>(lower_limit)) <= + static_cast<unsigned_T>(static_cast<unsigned_T>(higher_limit) - + static_cast<unsigned_T>(lower_limit)); +} + +#define LAZY_INSTANCE_INITIALIZER \ + {} + +template <typename T> +class LazyInstanceImpl { + public: + LazyInstanceImpl() : value_(js::mutexid::IrregexpLazyStatic) {} + + const T* Pointer() { + auto val = value_.lock(); + if (val->isNothing()) { + val->emplace(); + } + return val->ptr(); + } + + private: + js::ExclusiveData<mozilla::Maybe<T>> value_; +}; + +template <typename T> +class LazyInstance { + public: + using type = LazyInstanceImpl<T>; +}; + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/utils.h#L40-L48 +// Returns the value (0 .. 15) of a hexadecimal character c. +// If c is not a legal hexadecimal character, returns a value < 0. +// Used in regexp-parser.cc +inline int HexValue(base::uc32 c) { + c -= '0'; + if (static_cast<unsigned>(c) <= 9) return c; + c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. + if (static_cast<unsigned>(c) <= 5) return c + 10; + return -1; +} + +template <typename... Args> +[[nodiscard]] uint32_t hash_combine(uint32_t aHash, Args... 
aArgs) { + return mozilla::AddToHash(aHash, aArgs...); +} + +template <typename T> +class Optional { + mozilla::Maybe<T> inner_; + + public: + Optional() = default; + Optional(T t) { inner_.emplace(t); } + + bool has_value() const { return inner_.isSome(); } + const T& value() const { return inner_.ref(); } +}; + +namespace bits { + +inline uint64_t CountTrailingZeros(uint64_t value) { + return mozilla::CountTrailingZeroes64(value); +} + +inline size_t RoundUpToPowerOfTwo32(size_t value) { + return mozilla::RoundUpPow2(value); +} + +template <typename T> +constexpr bool IsPowerOfTwo(T value) { + return value > 0 && (value & (value - 1)) == 0; +} + +} // namespace bits +} // namespace base + +namespace unibrow { + +using uchar = unsigned int; + +// Origin: +// https://github.com/v8/v8/blob/1f1e4cdb04c75eab77adbecd5f5514ddc3eb56cf/src/strings/unicode.h#L133-L150 +class Latin1 { + public: + static const base::uc16 kMaxChar = 0xff; + + // Convert the character to Latin-1 case equivalent if possible. + static inline base::uc16 TryConvertToLatin1(base::uc16 c) { + // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN". + // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN". + if (c == 0x039C || c == 0x03BC) { + return 0xB5; + } + // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER + // Y WITH DIAERESIS". 
+ if (c == 0x0178) { + return 0xFF; + } + return c; + } +}; + +// Origin: +// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L99-L131 +class Utf16 { + public: + static inline bool IsLeadSurrogate(int code) { + return js::unicode::IsLeadSurrogate(code); + } + static inline bool IsTrailSurrogate(int code) { + return js::unicode::IsTrailSurrogate(code); + } + static inline base::uc16 LeadSurrogate(uint32_t char_code) { + return js::unicode::LeadSurrogate(char_code); + } + static inline base::uc16 TrailSurrogate(uint32_t char_code) { + return js::unicode::TrailSurrogate(char_code); + } + static inline uint32_t CombineSurrogatePair(char16_t lead, char16_t trail) { + return js::unicode::UTF16Decode(lead, trail); + } + static const uchar kMaxNonSurrogateCharCode = 0xffff; +}; + +#ifndef V8_INTL_SUPPORT + +// A cache used in case conversion. It caches the value for characters +// that either have no mapping or map to a single character independent +// of context. Characters that map to more than one character or that +// map differently depending on context are always looked up. 
+// Origin: +// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L64-L88 +template <class T, int size = 256> +class Mapping { + public: + inline Mapping() = default; + inline int get(uchar c, uchar n, uchar* result) { + CacheEntry entry = entries_[c & kMask]; + if (entry.code_point_ == c) { + if (entry.offset_ == 0) { + return 0; + } else { + result[0] = c + entry.offset_; + return 1; + } + } else { + return CalculateValue(c, n, result); + } + } + + private: + int CalculateValue(uchar c, uchar n, uchar* result) { + bool allow_caching = true; + int length = T::Convert(c, n, result, &allow_caching); + if (allow_caching) { + if (length == 1) { + entries_[c & kMask] = CacheEntry(c, result[0] - c); + return 1; + } else { + entries_[c & kMask] = CacheEntry(c, 0); + return 0; + } + } else { + return length; + } + } + + struct CacheEntry { + inline CacheEntry() : code_point_(kNoChar), offset_(0) {} + inline CacheEntry(uchar code_point, signed offset) + : code_point_(code_point), offset_(offset) {} + uchar code_point_; + signed offset_; + static const int kNoChar = (1 << 21) - 1; + }; + static const int kSize = size; + static const int kMask = kSize - 1; + CacheEntry entries_[kSize]; +}; + +// Origin: +// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L241-L252 +struct Ecma262Canonicalize { + static const int kMaxWidth = 1; + static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); +}; +struct Ecma262UnCanonicalize { + static const int kMaxWidth = 4; + static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); +}; +struct CanonicalizationRange { + static const int kMaxWidth = 1; + static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); +}; + +#endif // !V8_INTL_SUPPORT + +struct Letter { + static bool Is(uchar c); +}; + +} // namespace unibrow + +namespace internal { + +#define PRINTF_FORMAT(x, y) MOZ_FORMAT_PRINTF(x, y) 
+void PRINTF_FORMAT(1, 2) PrintF(const char* format, ...); +void PRINTF_FORMAT(2, 3) PrintF(FILE* out, const char* format, ...); + +// Superclass for classes only using static method functions. +// The subclass of AllStatic cannot be instantiated at all. +class AllStatic { +#ifdef DEBUG + public: + AllStatic() = delete; +#endif +}; + +// Superclass for classes managed with new and delete. +// In irregexp, this is only AlternativeGeneration (in regexp-compiler.cc) +// Compare: +// https://github.com/v8/v8/blob/7b3332844212d78ee87a9426f3a6f7f781a8fbfa/src/utils/allocation.cc#L88-L96 +class Malloced { + public: + static void* operator new(size_t size) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + void* result = js_malloc(size); + if (!result) { + oomUnsafe.crash("Irregexp Malloced shim"); + } + return result; + } + static void operator delete(void* p) { js_free(p); } +}; + +constexpr int32_t KB = 1024; +constexpr int32_t MB = 1024 * 1024; + +#define kMaxInt JSVAL_INT_MAX +#define kMinInt JSVAL_INT_MIN +constexpr int kSystemPointerSize = sizeof(void*); + +// The largest integer n such that n and n + 1 are both exactly +// representable as a Number value. 
ES6 section 20.1.2.6 +constexpr double kMaxSafeInteger = 9007199254740991.0; // 2^53-1 + +constexpr int kBitsPerByte = 8; +constexpr int kBitsPerByteLog2 = 3; +constexpr int kUInt16Size = sizeof(uint16_t); +constexpr int kUInt32Size = sizeof(uint32_t); +constexpr int kInt64Size = sizeof(int64_t); + +constexpr int kMaxUInt16 = (1 << 16) - 1; + +inline constexpr bool IsDecimalDigit(base::uc32 c) { + return c >= '0' && c <= '9'; +} + +inline constexpr int AsciiAlphaToLower(base::uc32 c) { return c | 0x20; } + +inline bool is_uint24(int64_t val) { return (val >> 24) == 0; } +inline bool is_int24(int64_t val) { + int64_t limit = int64_t(1) << 23; + return (-limit <= val) && (val < limit); +} + +inline bool IsIdentifierStart(base::uc32 c) { + return js::unicode::IsIdentifierStart(char32_t(c)); +} +inline bool IsIdentifierPart(base::uc32 c) { + return js::unicode::IsIdentifierPart(char32_t(c)); +} + +// Wrappers to disambiguate char16_t and uc16. +struct AsUC16 { + explicit AsUC16(char16_t v) : value(v) {} + char16_t value; +}; + +struct AsUC32 { + explicit AsUC32(int32_t v) : value(v) {} + int32_t value; +}; + +std::ostream& operator<<(std::ostream& os, const AsUC16& c); +std::ostream& operator<<(std::ostream& os, const AsUC32& c); + +// This class is used for the output of trace-regexp-parser. V8 has +// an elaborate implementation to ensure that the output gets to the +// right place, even on Android. We just need something that will +// print output (ideally to stderr, to match the rest of our tracing +// code). This is an empty wrapper that will convert itself to +// std::cerr when used. 
+class StdoutStream { + public: + operator std::ostream&() const; + template <typename T> + std::ostream& operator<<(T t); +}; + +// Reuse existing Maybe implementation +using mozilla::Maybe; + +template <typename T> +Maybe<T> Just(const T& value) { + return mozilla::Some(value); +} + +template <typename T> +mozilla::Nothing Nothing() { + return mozilla::Nothing(); +} + +template <typename T> +using PseudoHandle = mozilla::UniquePtr<T, JS::FreePolicy>; + +// Compare 8bit/16bit chars to 8bit/16bit chars. +// Used indirectly by regexp-interpreter.cc +// Taken from: https://github.com/v8/v8/blob/master/src/utils/utils.h +template <typename lchar, typename rchar> +inline int CompareCharsUnsigned(const lchar* lhs, const rchar* rhs, + size_t chars) { + const lchar* limit = lhs + chars; + if (sizeof(*lhs) == sizeof(char) && sizeof(*rhs) == sizeof(char)) { + // memcmp compares byte-by-byte, yielding wrong results for two-byte + // strings on little-endian systems. + return memcmp(lhs, rhs, chars); + } + while (lhs < limit) { + int r = static_cast<int>(*lhs) - static_cast<int>(*rhs); + if (r != 0) return r; + ++lhs; + ++rhs; + } + return 0; +} +template <typename lchar, typename rchar> +inline int CompareChars(const lchar* lhs, const rchar* rhs, size_t chars) { + DCHECK_LE(sizeof(lchar), 2); + DCHECK_LE(sizeof(rchar), 2); + if (sizeof(lchar) == 1) { + if (sizeof(rchar) == 1) { + return CompareCharsUnsigned(reinterpret_cast<const uint8_t*>(lhs), + reinterpret_cast<const uint8_t*>(rhs), chars); + } else { + return CompareCharsUnsigned(reinterpret_cast<const uint8_t*>(lhs), + reinterpret_cast<const char16_t*>(rhs), + chars); + } + } else { + if (sizeof(rchar) == 1) { + return CompareCharsUnsigned(reinterpret_cast<const char16_t*>(lhs), + reinterpret_cast<const uint8_t*>(rhs), chars); + } else { + return CompareCharsUnsigned(reinterpret_cast<const char16_t*>(lhs), + reinterpret_cast<const char16_t*>(rhs), + chars); + } + } +} + +// Compare 8bit/16bit chars to 8bit/16bit chars. 
+template <typename lchar, typename rchar> +inline bool CompareCharsEqualUnsigned(const lchar* lhs, const rchar* rhs, + size_t chars) { + STATIC_ASSERT(std::is_unsigned<lchar>::value); + STATIC_ASSERT(std::is_unsigned<rchar>::value); + if (sizeof(*lhs) == sizeof(*rhs)) { + // memcmp compares byte-by-byte, but for equality it doesn't matter whether + // two-byte char comparison is little- or big-endian. + return memcmp(lhs, rhs, chars * sizeof(*lhs)) == 0; + } + for (const lchar* limit = lhs + chars; lhs < limit; ++lhs, ++rhs) { + if (*lhs != *rhs) return false; + } + return true; +} + +template <typename lchar, typename rchar> +inline bool CompareCharsEqual(const lchar* lhs, const rchar* rhs, + size_t chars) { + using ulchar = typename std::make_unsigned<lchar>::type; + using urchar = typename std::make_unsigned<rchar>::type; + return CompareCharsEqualUnsigned(reinterpret_cast<const ulchar*>(lhs), + reinterpret_cast<const urchar*>(rhs), chars); +} + +// V8::Object ~= JS::Value +class Object { + public: + // The default object constructor in V8 stores a nullptr, + // which has its low bit clear and is interpreted as Smi(0). + constexpr Object() : asBits_(JS::Int32Value(0).asRawBits()) {} + + Object(const JS::Value& value) : asBits_(value.asRawBits()) {} + + // This constructor is only used in an unused implementation of + // IsCharacterInRangeArray in regexp-macro-assembler.cc. + Object(uintptr_t raw) : asBits_(raw) { MOZ_CRASH("unused"); } + + // Used in regexp-interpreter.cc to check the return value of + // isolate->stack_guard()->HandleInterrupts(). We want to handle + // interrupts in the caller, so we always return false from + // HandleInterrupts and true here. 
+ inline bool IsException(Isolate*) const { + MOZ_ASSERT(!value().toBoolean()); + return true; + } + + JS::Value value() const { return JS::Value::fromRawBits(asBits_); } + + inline static Object cast(Object object) { return object; } + + protected: + void setValue(const JS::Value& val) { asBits_ = val.asRawBits(); } + uint64_t asBits_; +} JS_HAZ_GC_POINTER; + +class Smi : public Object { + public: + static Smi FromInt(int32_t value) { + Smi smi; + smi.setValue(JS::Int32Value(value)); + return smi; + } + static inline int32_t ToInt(const Object object) { + return object.value().toInt32(); + } +}; + +// V8::HeapObject ~= GC thing +class HeapObject : public Object { + public: + inline static HeapObject cast(Object object) { + HeapObject h; + h.setValue(object.value()); + return h; + } +}; + +// A fixed-size array with Objects (aka Values) as element types. +// Implemented using the dense elements of an ArrayObject. +// Used for named captures. +class FixedArray : public HeapObject { + public: + inline void set(uint32_t index, Object value) { + inner()->setDenseElement(index, value.value()); + } + inline static FixedArray cast(Object object) { + FixedArray f; + f.setValue(object.value()); + return f; + } + js::NativeObject* inner() { + return &value().toObject().as<js::NativeObject>(); + } +}; + +/* + * Conceptually, ByteArrayData is a variable-size structure. To + * implement this in a C++-approved way, we allocate a struct + * containing the 32-bit length field, followed by additional memory + * for the data. To access the data, we get a pointer to the next byte + * after the length field and cast it to the correct type. 
+ */ +inline uint8_t* ByteArrayData::data() { + static_assert(alignof(uint8_t) <= alignof(ByteArrayData), + "The trailing data must be aligned to start immediately " + "after the header with no padding."); + ByteArrayData* immediatelyAfter = this + 1; + return reinterpret_cast<uint8_t*>(immediatelyAfter); +} + +template <typename T> +T* ByteArrayData::typedData() { + static_assert(alignof(T) <= alignof(ByteArrayData)); + MOZ_ASSERT(uintptr_t(data()) % alignof(T) == 0); + return reinterpret_cast<T*>(data()); +} + +template <typename T> +T ByteArrayData::getTyped(uint32_t index) { + MOZ_ASSERT(index < length / sizeof(T)); + return typedData<T>()[index]; +} + +template <typename T> +void ByteArrayData::setTyped(uint32_t index, T value) { + MOZ_ASSERT(index < length / sizeof(T)); + typedData<T>()[index] = value; +} + +// A fixed-size array of bytes. +class ByteArray : public HeapObject { + protected: + ByteArrayData* inner() const { + return static_cast<ByteArrayData*>(value().toPrivate()); + } + + public: + PseudoHandle<ByteArrayData> takeOwnership(Isolate* isolate); + PseudoHandle<ByteArrayData> maybeTakeOwnership(Isolate* isolate); + + byte get(uint32_t index) { return inner()->get(index); } + void set(uint32_t index, byte val) { inner()->set(index, val); } + + uint32_t length() const { return inner()->length; } + byte* GetDataStartAddress() { return inner()->data(); } + + static ByteArray cast(Object object) { + ByteArray b; + b.setValue(object.value()); + return b; + } + + bool IsByteArray() const { return true; } + + friend class SMRegExpMacroAssembler; +}; + +// This is a convenience class used in V8 for treating a ByteArray as an array +// of fixed-size integers. This version supports integral types up to 32 bits. 
+template <typename T> +class FixedIntegerArray : public ByteArray { + static_assert(alignof(T) <= alignof(ByteArrayData)); + static_assert(std::is_integral<T>::value); + + public: + static Handle<FixedIntegerArray<T>> New(Isolate* isolate, uint32_t length); + + T get(uint32_t index) { return inner()->template getTyped<T>(index); }; + void set(uint32_t index, T value) { + inner()->template setTyped<T>(index, value); + } + + static FixedIntegerArray<T> cast(Object object) { + FixedIntegerArray<T> f; + f.setValue(object.value()); + return f; + } +}; + +using FixedUInt16Array = FixedIntegerArray<uint16_t>; + +// Like Handles in SM, V8 handles are references to marked pointers. +// Unlike SM, where Rooted pointers are created individually on the +// stack, the target of a V8 handle lives in an arena on the isolate +// (~= JSContext). Whenever a Handle is created, a new "root" is +// created at the end of the arena. +// +// HandleScopes are used to manage the lifetimes of these handles. A +// HandleScope lives on the stack and stores the size of the arena at +// the time of its creation. When the function returns and the +// HandleScope is destroyed, the arena is truncated to its previous +// size, clearing all roots that were created since the creation of +// the HandleScope. +// +// In some cases, objects that are GC-allocated in V8 are not in SM. +// In particular, irregexp allocates ByteArrays during code generation +// to store lookup tables. This does not play nicely with the SM +// macroassembler's requirement that no GC allocations take place +// while it is on the stack. To work around this, this shim layer also +// provides the ability to create pseudo-handles, which are not +// managed by the GC but provide the same API to irregexp. The "root" +// of a pseudohandle is a unique pointer living in a second arena. If +// the allocated object should outlive the HandleScope, it must be +// manually moved out of the arena using maybeTakeOwnership. 
+// (If maybeTakeOwnership is called multiple times, it will return +// a null pointer on subsequent calls.) + +class MOZ_STACK_CLASS HandleScope { + public: + HandleScope(Isolate* isolate); + ~HandleScope(); + + private: + size_t level_ = 0; + size_t non_gc_level_ = 0; + Isolate* isolate_; + + friend class Isolate; +}; + +// Origin: +// https://github.com/v8/v8/blob/5792f3587116503fc047d2f68c951c72dced08a5/src/handles/handles.h#L88-L171 +template <typename T> +class MOZ_NONHEAP_CLASS Handle { + public: + Handle() : location_(nullptr) {} + Handle(T object, Isolate* isolate); + Handle(const JS::Value& value, Isolate* isolate); + + // Constructor for handling automatic up casting. + template <typename S, + typename = std::enable_if_t<std::is_convertible_v<S*, T*>>> + inline Handle(Handle<S> handle) : location_(handle.location_) {} + + inline bool is_null() const { return location_ == nullptr; } + + inline T operator*() const { return T::cast(Object(*location_)); }; + + // {ObjectRef} is returned by {Handle::operator->}. It should never be stored + // anywhere or used in any other code; no one should ever have to spell out + // {ObjectRef} in code. Its only purpose is to be dereferenced immediately by + // "operator-> chaining". Returning the address of the field is valid because + // this object's lifetime only ends at the end of the full statement. 
+ // Origin: + // https://github.com/v8/v8/blob/03aaa4b3bf4cb01eee1f223b252e6869b04ab08c/src/handles/handles.h#L91-L105 + class MOZ_TEMPORARY_CLASS ObjectRef { + public: + T* operator->() { return &object_; } + + private: + friend class Handle; + explicit ObjectRef(T object) : object_(object) {} + + T object_; + }; + inline ObjectRef operator->() const { return ObjectRef{**this}; } + + static Handle<T> fromHandleValue(JS::HandleValue handle) { + return Handle(handle.address()); + } + + private: + Handle(const JS::Value* location) : location_(location) {} + + template <typename> + friend class Handle; + template <typename> + friend class MaybeHandle; + + const JS::Value* location_; +}; + +// A Handle can be converted into a MaybeHandle. Converting a MaybeHandle +// into a Handle requires checking that it does not point to nullptr. This +// ensures nullptr checks before use. +// +// Also note that Handles do not provide default equality comparison or hashing +// operators on purpose. Such operators would be misleading, because intended +// semantics is ambiguous between Handle location and object identity. +// Origin: +// https://github.com/v8/v8/blob/5792f3587116503fc047d2f68c951c72dced08a5/src/handles/maybe-handles.h#L15-L78 +template <typename T> +class MOZ_NONHEAP_CLASS MaybeHandle final { + public: + MaybeHandle() : location_(nullptr) {} + + // Constructor for handling automatic up casting from Handle. + // Ex. Handle<JSArray> can be passed when MaybeHandle<Object> is expected. + template <typename S, + typename = std::enable_if_t<std::is_convertible_v<S*, T*>>> + MaybeHandle(Handle<S> handle) : location_(handle.location_) {} + + inline Handle<T> ToHandleChecked() const { + MOZ_RELEASE_ASSERT(location_); + return Handle<T>(location_); + } + + // Convert to a Handle with a type that can be upcasted to. 
+ template <typename S> + inline bool ToHandle(Handle<S>* out) const { + if (location_) { + *out = Handle<T>(location_); + return true; + } else { + *out = Handle<T>(); + return false; + } + } + + private: + JS::Value* location_; +}; + +// From v8/src/handles/handles-inl.h + +template <typename T> +inline Handle<T> handle(T object, Isolate* isolate) { + return Handle<T>(object, isolate); +} + +// RAII Guard classes + +using DisallowGarbageCollection = JS::AutoAssertNoGC; + +// V8 uses this inside DisallowGarbageCollection regions to turn +// allocation back on before throwing a stack overflow exception or +// handling interrupts. AutoSuppressGC is sufficient for the former +// case, but not for the latter: handling interrupts can execute +// arbitrary script code, and V8 jumps through some scary hoops to +// "manually relocate unhandlified references" afterwards. To keep +// things sane, we don't try to handle interrupts while regex code is +// still on the stack. Instead, we return EXCEPTION and handle +// interrupts in the caller. (See RegExpShared::execute.) + +class AllowGarbageCollection { + public: + AllowGarbageCollection() {} +}; + +// Origin: +// https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L83-L474 +class String : public HeapObject { + private: + JSString* str() const { return value().toString(); } + + public: + String() = default; + String(JSString* str) { setValue(JS::StringValue(str)); } + + operator JSString*() const { return str(); } + + // Max char codes. 
+ static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; + static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar; + static const int kMaxUtf16CodeUnit = 0xffff; + static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit; + static const base::uc32 kMaxCodePoint = 0x10ffff; + + MOZ_ALWAYS_INLINE int length() const { return str()->length(); } + bool IsFlat() { return str()->isLinear(); }; + + // Origin: + // https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L95-L152 + class FlatContent { + public: + FlatContent(JSLinearString* string, const DisallowGarbageCollection& no_gc) + : string_(string), no_gc_(no_gc) {} + inline bool IsOneByte() const { return string_->hasLatin1Chars(); } + inline bool IsTwoByte() const { return !string_->hasLatin1Chars(); } + + base::Vector<const uint8_t> ToOneByteVector() const { + MOZ_ASSERT(IsOneByte()); + return base::Vector<const uint8_t>(string_->latin1Chars(no_gc_), + string_->length()); + } + base::Vector<const base::uc16> ToUC16Vector() const { + MOZ_ASSERT(IsTwoByte()); + return base::Vector<const base::uc16>(string_->twoByteChars(no_gc_), + string_->length()); + } + void UnsafeDisableChecksumVerification() { + // Intentional no-op. See the comment for AllowGarbageCollection above. 
+ } + + private: + const JSLinearString* string_; + const JS::AutoAssertNoGC& no_gc_; + }; + FlatContent GetFlatContent(const DisallowGarbageCollection& no_gc) { + MOZ_ASSERT(IsFlat()); + return FlatContent(&str()->asLinear(), no_gc); + } + + static Handle<String> Flatten(Isolate* isolate, Handle<String> string); + + inline static String cast(Object object) { + String s; + MOZ_ASSERT(object.value().isString()); + s.setValue(object.value()); + return s; + } + + inline static bool IsOneByteRepresentationUnderneath(String string) { + return string.str()->hasLatin1Chars(); + } + inline bool IsOneByteRepresentation() const { + return str()->hasLatin1Chars(); + } + + std::unique_ptr<char[]> ToCString(); + + template <typename Char> + base::Vector<const Char> GetCharVector( + const DisallowGarbageCollection& no_gc); +}; + +template <> +inline base::Vector<const uint8_t> String::GetCharVector( + const DisallowGarbageCollection& no_gc) { + String::FlatContent flat = GetFlatContent(no_gc); + MOZ_ASSERT(flat.IsOneByte()); + return flat.ToOneByteVector(); +} + +template <> +inline base::Vector<const base::uc16> String::GetCharVector( + const DisallowGarbageCollection& no_gc) { + String::FlatContent flat = GetFlatContent(no_gc); + MOZ_ASSERT(flat.IsTwoByte()); + return flat.ToUC16Vector(); +} + +class JSRegExp : public HeapObject { + public: + JSRegExp() : HeapObject() {} + JSRegExp(js::RegExpShared* re) { setValue(JS::PrivateGCThingValue(re)); } + + // ****************************************************** + // Methods that are called from inside the implementation + // ****************************************************** + void TierUpTick() { inner()->tierUpTick(); } + + Object bytecode(bool is_latin1) const { + return Object(JS::PrivateValue(inner()->getByteCode(is_latin1))); + } + + // TODO: should we expose this? 
+ uint32_t backtrack_limit() const { return 0; } + + static JSRegExp cast(Object object) { + JSRegExp regexp; + js::gc::Cell* regexpShared = object.value().toGCThing(); + MOZ_ASSERT(regexpShared->is<js::RegExpShared>()); + regexp.setValue(JS::PrivateGCThingValue(regexpShared)); + return regexp; + } + + // Each capture (including the match itself) needs two registers. + static constexpr int RegistersForCaptureCount(int count) { + return (count + 1) * 2; + } + + inline uint32_t max_register_count() const { + return inner()->getMaxRegisters(); + } + + // ****************************** + // Static constants + // ****************************** + + static constexpr int kMaxCaptures = (1 << 15) - 1; + + static constexpr int kNoBacktrackLimit = 0; + + private: + js::RegExpShared* inner() const { + return value().toGCThing()->as<js::RegExpShared>(); + } +}; + +using RegExpFlags = JS::RegExpFlags; + +inline bool IsUnicode(RegExpFlags flags) { return flags.unicode(); } +inline bool IsGlobal(RegExpFlags flags) { return flags.global(); } +inline bool IsIgnoreCase(RegExpFlags flags) { return flags.ignoreCase(); } +inline bool IsMultiline(RegExpFlags flags) { return flags.multiline(); } +inline bool IsDotAll(RegExpFlags flags) { return flags.dotAll(); } +inline bool IsSticky(RegExpFlags flags) { return flags.sticky(); } + +// TODO: Support /v flag (bug 1713657) +inline bool IsUnicodeSets(RegExpFlags flags) { return false; } +inline bool IsEitherUnicode(RegExpFlags flags) { return flags.unicode(); } + +class Histogram { + public: + inline void AddSample(int sample) {} +}; + +class Counters { + public: + Histogram* regexp_backtracks() { return &regexp_backtracks_; } + + private: + Histogram regexp_backtracks_; +}; + +enum class AllocationType : uint8_t { + kYoung, // Allocate in the nursery + kOld, // Allocate in the tenured heap +}; + +using StackGuard = Isolate; +using Factory = Isolate; + +class Isolate { + public: + Isolate(JSContext* cx) : cx_(cx) {} + ~Isolate(); + bool init(); 
+ + size_t sizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf) const; + + //********** Isolate code **********// + RegExpStack* regexp_stack() const { return regexpStack_; } + + // This is called from inside no-GC code. Instead of suppressing GC + // to allocate the error, we return false from Execute and call + // ReportOverRecursed in the caller. + void StackOverflow() {} + +#ifndef V8_INTL_SUPPORT + unibrow::Mapping<unibrow::Ecma262UnCanonicalize>* jsregexp_uncanonicalize() { + return &jsregexp_uncanonicalize_; + } + unibrow::Mapping<unibrow::Ecma262Canonicalize>* + regexp_macro_assembler_canonicalize() { + return &regexp_macro_assembler_canonicalize_; + } + unibrow::Mapping<unibrow::CanonicalizationRange>* jsregexp_canonrange() { + return &jsregexp_canonrange_; + } + + private: + unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize_; + unibrow::Mapping<unibrow::Ecma262Canonicalize> + regexp_macro_assembler_canonicalize_; + unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange_; +#endif // !V8_INTL_SUPPORT + + public: + // An empty stub for telemetry we don't support + void IncreaseTotalRegexpCodeGenerated(Handle<HeapObject> code) {} + + Counters* counters() { return &counters_; } + + //********** Factory code **********// + inline Factory* factory() { return this; } + + Handle<ByteArray> NewByteArray( + int length, AllocationType allocation = AllocationType::kYoung); + + // Allocates a fixed array initialized with undefined values. + Handle<FixedArray> NewFixedArray(int length); + + template <typename T> + Handle<FixedIntegerArray<T>> NewFixedIntegerArray(uint32_t length); + + template <typename Char> + Handle<String> InternalizeString(const base::Vector<const Char>& str); + + //********** Stack guard code **********// + inline StackGuard* stack_guard() { return this; } + + uintptr_t real_climit() { return cx_->stackLimit(JS::StackForSystemCode); } + + // This is called from inside no-GC code. 
V8 runs the interrupt + // inside the no-GC code and then "manually relocates unhandlified + // references" afterwards. We just return false and let the caller + // handle interrupts. + Object HandleInterrupts() { return Object(JS::BooleanValue(false)); } + + JSContext* cx() const { return cx_; } + + void trace(JSTracer* trc); + + //********** Handle code **********// + + JS::Value* getHandleLocation(const JS::Value& value); + + private: + mozilla::SegmentedVector<JS::Value, 256> handleArena_; + mozilla::SegmentedVector<PseudoHandle<void>, 256> uniquePtrArena_; + + void* allocatePseudoHandle(size_t bytes); + + public: + template <typename T> + PseudoHandle<T> takeOwnership(void* ptr); + template <typename T> + PseudoHandle<T> maybeTakeOwnership(void* ptr); + + uint32_t liveHandles() const { return handleArena_.Length(); } + uint32_t livePseudoHandles() const { return uniquePtrArena_.Length(); } + + private: + void openHandleScope(HandleScope& scope) { + scope.level_ = handleArena_.Length(); + scope.non_gc_level_ = uniquePtrArena_.Length(); + } + void closeHandleScope(size_t prevLevel, size_t prevUniqueLevel) { + size_t currLevel = handleArena_.Length(); + handleArena_.PopLastN(currLevel - prevLevel); + + size_t currUniqueLevel = uniquePtrArena_.Length(); + uniquePtrArena_.PopLastN(currUniqueLevel - prevUniqueLevel); + } + friend class HandleScope; + + JSContext* cx_; + RegExpStack* regexpStack_{}; + Counters counters_{}; +#ifdef DEBUG + public: + uint32_t shouldSimulateInterrupt_ = 0; +#endif +}; + +// Origin: +// https://github.com/v8/v8/blob/50dcf2af54ce27801a71c47c1be1d2c5e36b0dd6/src/execution/isolate.h#L1909-L1931 +class StackLimitCheck { + public: + StackLimitCheck(Isolate* isolate) : cx_(isolate->cx()) {} + + // Use this to check for stack-overflows in C++ code. 
+ bool HasOverflowed() { + js::AutoCheckRecursionLimit recursion(cx_); + bool overflowed = !recursion.checkDontReport(cx_); + if (overflowed && js::SupportDifferentialTesting()) { + // We don't report overrecursion here, but we throw an exception later + // and this still affects differential testing. Mimic ReportOverRecursed + // (the fuzzers check for this particular string). + fprintf(stderr, "ReportOverRecursed called\n"); + } + return overflowed; + } + + // Use this to check for interrupt request in C++ code. + bool InterruptRequested() { + return cx_->hasPendingInterrupt(js::InterruptReason::CallbackUrgent); + } + + // Use this to check for stack-overflow when entering runtime from JS code. + bool JsHasOverflowed() { + js::AutoCheckRecursionLimit recursion(cx_); + return !recursion.checkDontReport(cx_); + } + + private: + JSContext* cx_; +}; + +class ExternalReference { + public: + static const void* TopOfRegexpStack(Isolate* isolate); + static size_t SizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf, + RegExpStack* regexpStack); +}; + +class Code : public HeapObject { + public: + uint8_t* raw_instruction_start() { return inner()->raw(); } + + static Code cast(Object object) { + Code c; + js::gc::Cell* jitCode = object.value().toGCThing(); + MOZ_ASSERT(jitCode->is<js::jit::JitCode>()); + c.setValue(JS::PrivateGCThingValue(jitCode)); + return c; + } + js::jit::JitCode* inner() { + return value().toGCThing()->as<js::jit::JitCode>(); + } +}; + +// Only used in function signature of functions we don't implement +// (NativeRegExpMacroAssembler::CheckStackGuardState) +class InstructionStream {}; + +// Origin: https://github.com/v8/v8/blob/master/src/codegen/label.h +class Label { + public: + Label() : inner_(js::jit::Label()) {} + + js::jit::Label* inner() { return &inner_; } + + void Unuse() { inner_.reset(); } + + bool is_linked() { return inner_.used(); } + bool is_bound() { return inner_.bound(); } + bool is_unused() { return !inner_.used() && 
!inner_.bound(); } + + int pos() { return inner_.offset(); } + void link_to(int pos) { inner_.use(pos); } + void bind_to(int pos) { inner_.bind(pos); } + + private: + js::jit::Label inner_; + js::jit::CodeOffset patchOffset_; + + friend class SMRegExpMacroAssembler; +}; + +#define v8_flags js::jit::JitOptions + +#define V8_USE_COMPUTED_GOTO 1 +#define COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + +} // namespace internal +} // namespace v8 + +namespace V8 { + +inline void FatalProcessOutOfMemory(v8::internal::Isolate* isolate, + const char* msg) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + oomUnsafe.crash(msg); +} + +} // namespace V8 + +#endif // RegexpShim_h diff --git a/js/src/irregexp/RegExpTypes.h b/js/src/irregexp/RegExpTypes.h new file mode 100644 index 0000000000..e2a619689c --- /dev/null +++ b/js/src/irregexp/RegExpTypes.h @@ -0,0 +1,100 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// This file forward-defines Irregexp classes that need to be visible +// to the rest of Spidermonkey and re-exports them into js::irregexp. + +#ifndef regexp_RegExpTypes_h +#define regexp_RegExpTypes_h + +#include "js/UniquePtr.h" + +namespace js { +class MatchPairs; +} + +namespace v8 { +namespace internal { + +class ByteArrayData { + public: + uint32_t length; + uint8_t* data(); + + uint8_t get(uint32_t index) { + MOZ_ASSERT(index < length); + return data()[index]; + } + void set(uint32_t index, uint8_t val) { + MOZ_ASSERT(index < length); + data()[index] = val; + } + + // Used for FixedIntegerArray. 
+ template <typename T> + T getTyped(uint32_t index); + template <typename T> + void setTyped(uint32_t index, T value); + + private: + template <typename T> + T* typedData(); +}; + +class Isolate; +class RegExpStack; +class RegExpStackScope; + +struct InputOutputData { + const void* inputStart; + const void* inputEnd; + + // Index into inputStart (in chars) at which to begin matching. + size_t startIndex; + + js::MatchPairs* matches; + + template <typename CharT> + InputOutputData(const CharT* inputStart, const CharT* inputEnd, + size_t startIndex, js::MatchPairs* matches) + : inputStart(inputStart), + inputEnd(inputEnd), + startIndex(startIndex), + matches(matches) {} + + // Note: return int32_t instead of size_t to prevent signed => unsigned + // conversions in caller functions. + static constexpr int32_t offsetOfInputStart() { + return int32_t(offsetof(InputOutputData, inputStart)); + } + static constexpr int32_t offsetOfInputEnd() { + return int32_t(offsetof(InputOutputData, inputEnd)); + } + static constexpr int32_t offsetOfStartIndex() { + return int32_t(offsetof(InputOutputData, startIndex)); + } + static constexpr int32_t offsetOfMatches() { + return int32_t(offsetof(InputOutputData, matches)); + } +}; + +} // namespace internal +} // namespace v8 + +namespace js { +namespace irregexp { + +using Isolate = v8::internal::Isolate; +using RegExpStack = v8::internal::RegExpStack; +using RegExpStackScope = v8::internal::RegExpStackScope; +using ByteArrayData = v8::internal::ByteArrayData; +using ByteArray = js::UniquePtr<v8::internal::ByteArrayData, JS::FreePolicy>; +using InputOutputData = v8::internal::InputOutputData; + +} // namespace irregexp +} // namespace js + +#endif // regexp_RegExpTypes_h diff --git a/js/src/irregexp/import-irregexp.py b/js/src/irregexp/import-irregexp.py new file mode 100755 index 0000000000..523276e7b3 --- /dev/null +++ b/js/src/irregexp/import-irregexp.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +# This Source Code Form is 
subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +# This script handles all the mechanical steps of importing irregexp from v8: +# +# 1. Acquire the source: either from github, or optionally from a local copy of v8. +# 2. Copy the contents of v8/src/regexp into js/src/irregexp/imported +# - Exclude files that we have chosen not to import. +# 3. While doing so, update #includes: +# - Change "src/regexp/*" to "irregexp/imported/*". +# - Remove other v8-specific headers completely. +# 4. Add '#include "irregexp/RegExpShim.h" in the necessary places. +# 5. Update the IRREGEXP_VERSION file to include the correct git hash. +# +# Usage: +# cd path/to/js/src/irregexp +# ./import-irregexp.py --path path/to/v8/src/regexp +# +# Alternatively, without the --path argument, import-irregexp.py will +# clone v8 from github into a temporary directory. +# +# After running this script, changes to the shim code may be necessary +# to account for changes in upstream irregexp. + +import os +import re +import subprocess +import sys +from pathlib import Path + + +def copy_and_update_includes(src_path, dst_path): + # List of header files that need to include the shim header + need_shim = [ + "property-sequences.h", + "regexp-ast.h", + "regexp-bytecode-peephole.h", + "regexp-bytecodes.h", + "regexp-dotprinter.h", + "regexp-error.h", + "regexp.h", + "regexp-macro-assembler.h", + "regexp-parser.h", + "regexp-stack.h", + "special-case.h", + ] + + src = open(str(src_path), "r") + dst = open(str(dst_path), "w") + + # 1. Rewrite includes of V8 regexp headers: + # Note that we exclude regexp-flags.h and provide our own definition. + regexp_include = re.compile('#include "src/regexp(?!/regexp-flags.h)') + regexp_include_new = '#include "irregexp/imported' + + # 2. Remove includes of other V8 headers + other_include = re.compile('#include "src/') + + # 3. 
If needed, add '#include "irregexp/RegExpShim.h"'. + # Note: We get a little fancy to ensure that header files are + # in alphabetic order. `need_to_add_shim` is true if we still + # have to add the shim header in this file. `adding_shim_now` + # is true if we have found a '#include "src/*' and we are just + # waiting to find an empty line so that we can insert the shim + # header in the right place. + need_to_add_shim = src_path.name in need_shim + adding_shim_now = False + + for line in src: + if adding_shim_now: + if line == "\n": + dst.write('#include "irregexp/RegExpShim.h"\n') + need_to_add_shim = False + adding_shim_now = False + + if regexp_include.search(line): + dst.write(re.sub(regexp_include, regexp_include_new, line)) + elif other_include.search(line): + if need_to_add_shim: + adding_shim_now = True + else: + dst.write(line) + + +def import_from(srcdir, dstdir): + excluded = [ + "DIR_METADATA", + "OWNERS", + "regexp.cc", + "regexp-flags.h", + "regexp-utils.cc", + "regexp-utils.h", + "regexp-macro-assembler-arch.h", + ] + + for file in srcdir.iterdir(): + if file.is_dir(): + continue + if str(file.name) in excluded: + continue + copy_and_update_includes(file, dstdir / "imported" / file.name) + + +if __name__ == "__main__": + import argparse + import tempfile + + # This script should be run from js/src/irregexp to work correctly. 
+ current_path = Path(os.getcwd()) + expected_path = "js/src/irregexp" + if not current_path.match(expected_path): + raise RuntimeError("%s must be run from %s" % (sys.argv[0], expected_path)) + + parser = argparse.ArgumentParser(description="Import irregexp from v8") + parser.add_argument("-p", "--path", help="path to v8/src/regexp", required=False) + args = parser.parse_args() + + if args.path: + src_path = Path(args.path) + provided_path = "the command-line" + elif "TASK_ID" in os.environ: + src_path = Path("/builds/worker/v8/") + subprocess.run("git pull origin master", shell=True, cwd=src_path) + + src_path = Path("/builds/worker/v8/src/regexp") + provided_path = "the hardcoded path in the taskcluster image" + elif "V8_GIT" in os.environ: + src_path = Path(os.environ["V8_GIT"]) + provided_path = "the V8_GIT environment variable" + else: + tempdir = tempfile.TemporaryDirectory() + v8_git = "https://github.com/v8/v8.git" + clone = "git clone --depth 1 %s %s" % (v8_git, tempdir.name) + os.system(clone) + src_path = Path(tempdir.name) / "src/regexp" + provided_path = "the temporary git checkout" + + if not (src_path / "regexp.h").exists(): + print("Could not find regexp.h in the path provided from", provided_path) + print("Usage:\n import-irregexp.py [--path <path/to/v8/src/regexp>]") + sys.exit(1) + + if "MACH_VENDOR" not in os.environ: + print( + "Running this script outside ./mach vendor is not recommended - ", + "You will need to update moz.yaml manually", + ) + print("We recommend instead `./mach vendor js/src/irregexp/moz.yaml`") + response = input("Type Y to continue... ") + if response.lower() != "y": + sys.exit(1) + + import_from(src_path, current_path) diff --git a/js/src/irregexp/imported/gen-regexp-special-case.cc b/js/src/irregexp/imported/gen-regexp-special-case.cc new file mode 100644 index 0000000000..8f6557ed30 --- /dev/null +++ b/js/src/irregexp/imported/gen-regexp-special-case.cc @@ -0,0 +1,214 @@ +// Copyright 2020 the V8 project authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <fstream> +#include <iomanip> +#include <iostream> +#include <sstream> + +#include "irregexp/imported/special-case.h" +#include "unicode/usetiter.h" + +namespace v8 { +namespace internal { + +static const base::uc32 kSurrogateStart = 0xd800; +static const base::uc32 kSurrogateEnd = 0xdfff; +static const base::uc32 kNonBmpStart = 0x10000; + +// The following code generates "src/regexp/special-case.cc". +void PrintSet(std::ofstream& out, const char* name, + const icu::UnicodeSet& set) { + out << "icu::UnicodeSet Build" << name << "() {\n" + << " icu::UnicodeSet set;\n"; + for (int32_t i = 0; i < set.getRangeCount(); i++) { + if (set.getRangeStart(i) == set.getRangeEnd(i)) { + out << " set.add(0x" << set.getRangeStart(i) << ");\n"; + } else { + out << " set.add(0x" << set.getRangeStart(i) << ", 0x" + << set.getRangeEnd(i) << ");\n"; + } + } + out << " set.freeze();\n" + << " return set;\n" + << "}\n\n"; + + out << "struct " << name << "Data {\n" + << " " << name << "Data() : set(Build" << name << "()) {}\n" + << " const icu::UnicodeSet set;\n" + << "};\n\n"; + + out << "//static\n" + << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" + << " static base::LazyInstance<" << name << "Data>::type set =\n" + << " LAZY_INSTANCE_INITIALIZER;\n" + << " return set.Pointer()->set;\n" + << "}\n\n"; +} + +void PrintSpecial(std::ofstream& out) { + icu::UnicodeSet current; + icu::UnicodeSet special_add; + icu::UnicodeSet ignore; + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeSet upper("[\\p{Lu}]", status); + CHECK(U_SUCCESS(status)); + + // Iterate through all chars in BMP except surrogates. 
+ for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { + if (i >= static_cast<UChar32>(kSurrogateStart) && + i <= static_cast<UChar32>(kSurrogateEnd)) { + continue; // Ignore surrogate range + } + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Check to see if all characters in the case-folding equivalence + // class as defined by UnicodeSet::closeOver all map to the same + // canonical value. + UChar32 canonical = RegExpCaseFolding::Canonicalize(i); + bool class_has_matching_canonical_char = false; + bool class_has_non_matching_canonical_char = false; + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); + c++) { + if (c == i) { + continue; + } + UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); + if (canonical == other_canonical) { + class_has_matching_canonical_char = true; + } else { + class_has_non_matching_canonical_char = true; + } + } + } + // If any other character in i's equivalence class has a + // different canonical value, then i needs special handling. If + // no other character shares a canonical value with i, we can + // ignore i when adding alternatives for case-independent + // comparison. If at least one other character shares a + // canonical value, then i needs special handling. + if (class_has_non_matching_canonical_char) { + if (class_has_matching_canonical_char) { + special_add.add(i); + } else { + ignore.add(i); + } + } + } + + // Verify that no Unicode equivalence class contains two non-trivial + // JS equivalence classes. Every character in SpecialAddSet has the + // same canonical value as every other non-IgnoreSet character in + // its Unicode equivalence class. Therefore, if we call closeOver on + // a set containing no IgnoreSet characters, the only characters + // that must be removed from the result are in IgnoreSet. This fact + // is used in CharacterRange::AddCaseEquivalents. 
+ for (int32_t i = 0; i < special_add.getRangeCount(); i++) { + for (UChar32 c = special_add.getRangeStart(i); + c <= special_add.getRangeEnd(i); c++) { + UChar32 canonical = RegExpCaseFolding::Canonicalize(c); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE); + current.removeAll(ignore); + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c2 = current.getRangeStart(j); + c2 <= current.getRangeEnd(j); c2++) { + CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); + } + } + } + } + + PrintSet(out, "IgnoreSet", ignore); + PrintSet(out, "SpecialAddSet", special_add); +} + +void PrintUnicodeSpecial(std::ofstream& out) { + icu::UnicodeSet non_simple_folding; + icu::UnicodeSet current; + UErrorCode status = U_ZERO_ERROR; + // Look at all characters except white spaces. + icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status); + CHECK_EQ(status, U_ZERO_ERROR); + icu::UnicodeSetIterator iter(interestingCP); + while (iter.next()) { + UChar32 c = iter.getCodepoint(); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings(); + CHECK(!current.isBogus()); + // Remove characters from the closeover that have a simple case folding. + icu::UnicodeSet toRemove; + icu::UnicodeSetIterator closeOverIter(current); + while (closeOverIter.next()) { + UChar32 closeOverChar = closeOverIter.getCodepoint(); + UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT); + if (closeOverChar != closeOverSCF) { + toRemove.add(closeOverChar); + } + } + CHECK(!toRemove.isBogus()); + current.removeAll(toRemove); + + // The current character and its simple case folding are also always OK. + UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT); + current.remove(c); + current.remove(scf); + + // If there are any characters remaining, they were added due to full case + // foldings and shouldn't match the current character according to the spec. 
+ if (!current.isEmpty()) { + // Ensure that the character doesn't have a simple case folding. + // Otherwise the current approach of simply removing the character from + // the set before calling closeOver won't work. + CHECK_EQ(c, scf); + non_simple_folding.add(c); + } + } + CHECK(!non_simple_folding.isBogus()); + + PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding); +} + +void WriteHeader(const char* header_filename) { + std::ofstream out(header_filename); + out << std::hex << std::setfill('0') << std::setw(4); + out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" + << "// Use of this source code is governed by a BSD-style license that\n" + << "// can be found in the LICENSE file.\n\n" + << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" + << "// The following functions are used to build UnicodeSets\n" + << "// for special cases where the case-folding algorithm used by\n" + << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" + << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" + << "// Semantics: Canonicalize) step 3.\n\n" + << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/base/lazy-instance.h\"\n\n" + << "#include \"src/regexp/special-case.h\"\n\n" + << "#include \"unicode/uniset.h\"\n" + << "namespace v8 {\n" + << "namespace internal {\n\n"; + + PrintSpecial(out); + PrintUnicodeSpecial(out); + + out << "\n" + << "} // namespace internal\n" + << "} // namespace v8\n" + << "#endif // V8_INTL_SUPPORT\n"; +} + +} // namespace internal +} // namespace v8 + +int main(int argc, const char** argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " <output filename>\n"; + std::exit(1); + } + v8::internal::WriteHeader(argv[1]); + + return 0; +} diff --git a/js/src/irregexp/imported/property-sequences.cc b/js/src/irregexp/imported/property-sequences.cc new file mode 100644 index 0000000000..b37ec63115 --- /dev/null +++ 
b/js/src/irregexp/imported/property-sequences.cc @@ -0,0 +1,1246 @@ +// Copyright 2018 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifdef V8_INTL_SUPPORT + +#include "irregexp/imported/property-sequences.h" + +namespace v8 { +namespace internal { + +/* +Generated from following Node.js source: + +package.json + +``` +{ + "private": true, + "dependencies": { + "unicode-12.0.0": "^0.7.9" + } +} +``` + +generate-unicode-sequence-property-data.js + +``` +const toHex = (symbol) => { + return '0x' + symbol.codePointAt(0).toString(16) + .toUpperCase().padStart(6, '0'); +}; + +const generateData = (property) => { + const sequences = + require(`unicode-12.0.0/Sequence_Property/${ property }/index.js`); + const id = property.replace(/_/g, '') + 's'; + const buffer = []; + for (const sequence of sequences) { + const symbols = [...sequence]; + const codePoints = symbols.map(symbol => toHex(symbol)); + buffer.push(' ' + codePoints.join(', ') + ', 0,'); + } + const output = + `const base::uc32 UnicodePropertySequences::k${ id }[] = {\n` + + `${ buffer.join('\n') }\n 0 // null-terminating the list\n};\n`; + return output; +}; + +const properties = [ + 'Emoji_Flag_Sequence', + 'Emoji_Tag_Sequence', + 'Emoji_ZWJ_Sequence', +]; + +for (const property of properties) { + console.log(generateData(property)); +} +``` +*/ + +// clang-format off +const base::uc32 UnicodePropertySequences::kEmojiFlagSequences[] = { + 0x01F1E6, 0x01F1E8, 0, + 0x01F1FF, 0x01F1FC, 0, + 0x01F1E6, 0x01F1EA, 0, + 0x01F1E6, 0x01F1EB, 0, + 0x01F1E6, 0x01F1EC, 0, + 0x01F1E6, 0x01F1EE, 0, + 0x01F1E6, 0x01F1F1, 0, + 0x01F1E6, 0x01F1F2, 0, + 0x01F1E6, 0x01F1F4, 0, + 0x01F1E6, 0x01F1F6, 0, + 0x01F1E6, 0x01F1F7, 0, + 0x01F1E6, 0x01F1F8, 0, + 0x01F1E6, 0x01F1F9, 0, + 0x01F1E6, 0x01F1FA, 0, + 0x01F1E6, 0x01F1FC, 0, + 0x01F1E6, 0x01F1FD, 0, + 0x01F1E6, 0x01F1FF, 0, + 0x01F1E7, 0x01F1E6, 0, + 0x01F1E7, 0x01F1E7, 0, 
+ 0x01F1E7, 0x01F1E9, 0, + 0x01F1E7, 0x01F1EA, 0, + 0x01F1E7, 0x01F1EB, 0, + 0x01F1E7, 0x01F1EC, 0, + 0x01F1E7, 0x01F1ED, 0, + 0x01F1E7, 0x01F1EE, 0, + 0x01F1E7, 0x01F1EF, 0, + 0x01F1E7, 0x01F1F1, 0, + 0x01F1E7, 0x01F1F2, 0, + 0x01F1E7, 0x01F1F3, 0, + 0x01F1E7, 0x01F1F4, 0, + 0x01F1E7, 0x01F1F6, 0, + 0x01F1E7, 0x01F1F7, 0, + 0x01F1E7, 0x01F1F8, 0, + 0x01F1E7, 0x01F1F9, 0, + 0x01F1E7, 0x01F1FB, 0, + 0x01F1E7, 0x01F1FC, 0, + 0x01F1E7, 0x01F1FE, 0, + 0x01F1E7, 0x01F1FF, 0, + 0x01F1E8, 0x01F1E6, 0, + 0x01F1E8, 0x01F1E8, 0, + 0x01F1E8, 0x01F1E9, 0, + 0x01F1E8, 0x01F1EB, 0, + 0x01F1E8, 0x01F1EC, 0, + 0x01F1E8, 0x01F1ED, 0, + 0x01F1E8, 0x01F1EE, 0, + 0x01F1E8, 0x01F1F0, 0, + 0x01F1E8, 0x01F1F1, 0, + 0x01F1E8, 0x01F1F2, 0, + 0x01F1E8, 0x01F1F3, 0, + 0x01F1E8, 0x01F1F4, 0, + 0x01F1E8, 0x01F1F5, 0, + 0x01F1E8, 0x01F1F7, 0, + 0x01F1E8, 0x01F1FA, 0, + 0x01F1E8, 0x01F1FB, 0, + 0x01F1E8, 0x01F1FC, 0, + 0x01F1E8, 0x01F1FD, 0, + 0x01F1E8, 0x01F1FE, 0, + 0x01F1E8, 0x01F1FF, 0, + 0x01F1E9, 0x01F1EA, 0, + 0x01F1E9, 0x01F1EC, 0, + 0x01F1E9, 0x01F1EF, 0, + 0x01F1E9, 0x01F1F0, 0, + 0x01F1E9, 0x01F1F2, 0, + 0x01F1E9, 0x01F1F4, 0, + 0x01F1E9, 0x01F1FF, 0, + 0x01F1EA, 0x01F1E6, 0, + 0x01F1EA, 0x01F1E8, 0, + 0x01F1EA, 0x01F1EA, 0, + 0x01F1EA, 0x01F1EC, 0, + 0x01F1EA, 0x01F1ED, 0, + 0x01F1EA, 0x01F1F7, 0, + 0x01F1EA, 0x01F1F8, 0, + 0x01F1EA, 0x01F1F9, 0, + 0x01F1EA, 0x01F1FA, 0, + 0x01F1EB, 0x01F1EE, 0, + 0x01F1EB, 0x01F1EF, 0, + 0x01F1EB, 0x01F1F0, 0, + 0x01F1EB, 0x01F1F2, 0, + 0x01F1EB, 0x01F1F4, 0, + 0x01F1EB, 0x01F1F7, 0, + 0x01F1EC, 0x01F1E6, 0, + 0x01F1EC, 0x01F1E7, 0, + 0x01F1EC, 0x01F1E9, 0, + 0x01F1EC, 0x01F1EA, 0, + 0x01F1EC, 0x01F1EB, 0, + 0x01F1EC, 0x01F1EC, 0, + 0x01F1EC, 0x01F1ED, 0, + 0x01F1EC, 0x01F1EE, 0, + 0x01F1EC, 0x01F1F1, 0, + 0x01F1EC, 0x01F1F2, 0, + 0x01F1EC, 0x01F1F3, 0, + 0x01F1EC, 0x01F1F5, 0, + 0x01F1EC, 0x01F1F6, 0, + 0x01F1EC, 0x01F1F7, 0, + 0x01F1EC, 0x01F1F8, 0, + 0x01F1EC, 0x01F1F9, 0, + 0x01F1EC, 0x01F1FA, 0, + 0x01F1EC, 0x01F1FC, 0, + 0x01F1EC, 0x01F1FE, 0, 
+ 0x01F1ED, 0x01F1F0, 0, + 0x01F1ED, 0x01F1F2, 0, + 0x01F1ED, 0x01F1F3, 0, + 0x01F1ED, 0x01F1F7, 0, + 0x01F1ED, 0x01F1F9, 0, + 0x01F1ED, 0x01F1FA, 0, + 0x01F1EE, 0x01F1E8, 0, + 0x01F1EE, 0x01F1E9, 0, + 0x01F1EE, 0x01F1EA, 0, + 0x01F1EE, 0x01F1F1, 0, + 0x01F1EE, 0x01F1F2, 0, + 0x01F1EE, 0x01F1F3, 0, + 0x01F1EE, 0x01F1F4, 0, + 0x01F1EE, 0x01F1F6, 0, + 0x01F1EE, 0x01F1F7, 0, + 0x01F1EE, 0x01F1F8, 0, + 0x01F1EE, 0x01F1F9, 0, + 0x01F1EF, 0x01F1EA, 0, + 0x01F1EF, 0x01F1F2, 0, + 0x01F1EF, 0x01F1F4, 0, + 0x01F1EF, 0x01F1F5, 0, + 0x01F1F0, 0x01F1EA, 0, + 0x01F1F0, 0x01F1EC, 0, + 0x01F1F0, 0x01F1ED, 0, + 0x01F1F0, 0x01F1EE, 0, + 0x01F1F0, 0x01F1F2, 0, + 0x01F1F0, 0x01F1F3, 0, + 0x01F1F0, 0x01F1F5, 0, + 0x01F1F0, 0x01F1F7, 0, + 0x01F1F0, 0x01F1FC, 0, + 0x01F1E6, 0x01F1E9, 0, + 0x01F1F0, 0x01F1FF, 0, + 0x01F1F1, 0x01F1E6, 0, + 0x01F1F1, 0x01F1E7, 0, + 0x01F1F1, 0x01F1E8, 0, + 0x01F1F1, 0x01F1EE, 0, + 0x01F1F1, 0x01F1F0, 0, + 0x01F1F1, 0x01F1F7, 0, + 0x01F1F1, 0x01F1F8, 0, + 0x01F1F1, 0x01F1F9, 0, + 0x01F1F1, 0x01F1FA, 0, + 0x01F1F1, 0x01F1FB, 0, + 0x01F1F1, 0x01F1FE, 0, + 0x01F1F2, 0x01F1E6, 0, + 0x01F1F2, 0x01F1E8, 0, + 0x01F1F2, 0x01F1E9, 0, + 0x01F1F2, 0x01F1EA, 0, + 0x01F1F2, 0x01F1EB, 0, + 0x01F1F2, 0x01F1EC, 0, + 0x01F1F2, 0x01F1ED, 0, + 0x01F1F2, 0x01F1F0, 0, + 0x01F1F2, 0x01F1F1, 0, + 0x01F1F2, 0x01F1F2, 0, + 0x01F1F2, 0x01F1F3, 0, + 0x01F1F2, 0x01F1F4, 0, + 0x01F1F2, 0x01F1F5, 0, + 0x01F1F2, 0x01F1F6, 0, + 0x01F1F2, 0x01F1F7, 0, + 0x01F1F2, 0x01F1F8, 0, + 0x01F1F2, 0x01F1F9, 0, + 0x01F1F2, 0x01F1FA, 0, + 0x01F1F2, 0x01F1FB, 0, + 0x01F1F2, 0x01F1FC, 0, + 0x01F1F2, 0x01F1FD, 0, + 0x01F1F2, 0x01F1FE, 0, + 0x01F1F2, 0x01F1FF, 0, + 0x01F1F3, 0x01F1E6, 0, + 0x01F1F3, 0x01F1E8, 0, + 0x01F1F3, 0x01F1EA, 0, + 0x01F1F3, 0x01F1EB, 0, + 0x01F1F3, 0x01F1EC, 0, + 0x01F1F3, 0x01F1EE, 0, + 0x01F1F3, 0x01F1F1, 0, + 0x01F1F3, 0x01F1F4, 0, + 0x01F1F3, 0x01F1F5, 0, + 0x01F1F3, 0x01F1F7, 0, + 0x01F1F3, 0x01F1FA, 0, + 0x01F1F3, 0x01F1FF, 0, + 0x01F1F4, 0x01F1F2, 0, + 0x01F1F5, 0x01F1E6, 0, 
+ 0x01F1F5, 0x01F1EA, 0, + 0x01F1F5, 0x01F1EB, 0, + 0x01F1F5, 0x01F1EC, 0, + 0x01F1F5, 0x01F1ED, 0, + 0x01F1F5, 0x01F1F0, 0, + 0x01F1F5, 0x01F1F1, 0, + 0x01F1F5, 0x01F1F2, 0, + 0x01F1F5, 0x01F1F3, 0, + 0x01F1F5, 0x01F1F7, 0, + 0x01F1F5, 0x01F1F8, 0, + 0x01F1F5, 0x01F1F9, 0, + 0x01F1F5, 0x01F1FC, 0, + 0x01F1F5, 0x01F1FE, 0, + 0x01F1F6, 0x01F1E6, 0, + 0x01F1F7, 0x01F1EA, 0, + 0x01F1F7, 0x01F1F4, 0, + 0x01F1F7, 0x01F1F8, 0, + 0x01F1F7, 0x01F1FA, 0, + 0x01F1F7, 0x01F1FC, 0, + 0x01F1F8, 0x01F1E6, 0, + 0x01F1F8, 0x01F1E7, 0, + 0x01F1F8, 0x01F1E8, 0, + 0x01F1F8, 0x01F1E9, 0, + 0x01F1F8, 0x01F1EA, 0, + 0x01F1F8, 0x01F1EC, 0, + 0x01F1F8, 0x01F1ED, 0, + 0x01F1F8, 0x01F1EE, 0, + 0x01F1F8, 0x01F1EF, 0, + 0x01F1F8, 0x01F1F0, 0, + 0x01F1F8, 0x01F1F1, 0, + 0x01F1F8, 0x01F1F2, 0, + 0x01F1F8, 0x01F1F3, 0, + 0x01F1F8, 0x01F1F4, 0, + 0x01F1F8, 0x01F1F7, 0, + 0x01F1F8, 0x01F1F8, 0, + 0x01F1F8, 0x01F1F9, 0, + 0x01F1F8, 0x01F1FB, 0, + 0x01F1F8, 0x01F1FD, 0, + 0x01F1F8, 0x01F1FE, 0, + 0x01F1F8, 0x01F1FF, 0, + 0x01F1F9, 0x01F1E6, 0, + 0x01F1F9, 0x01F1E8, 0, + 0x01F1F9, 0x01F1E9, 0, + 0x01F1F9, 0x01F1EB, 0, + 0x01F1F9, 0x01F1EC, 0, + 0x01F1F9, 0x01F1ED, 0, + 0x01F1F9, 0x01F1EF, 0, + 0x01F1F9, 0x01F1F0, 0, + 0x01F1F9, 0x01F1F1, 0, + 0x01F1F9, 0x01F1F2, 0, + 0x01F1F9, 0x01F1F3, 0, + 0x01F1F9, 0x01F1F4, 0, + 0x01F1F9, 0x01F1F7, 0, + 0x01F1F9, 0x01F1F9, 0, + 0x01F1F9, 0x01F1FB, 0, + 0x01F1F9, 0x01F1FC, 0, + 0x01F1F9, 0x01F1FF, 0, + 0x01F1FA, 0x01F1E6, 0, + 0x01F1FA, 0x01F1EC, 0, + 0x01F1FA, 0x01F1F2, 0, + 0x01F1FA, 0x01F1F3, 0, + 0x01F1FA, 0x01F1F8, 0, + 0x01F1FA, 0x01F1FE, 0, + 0x01F1FA, 0x01F1FF, 0, + 0x01F1FB, 0x01F1E6, 0, + 0x01F1FB, 0x01F1E8, 0, + 0x01F1FB, 0x01F1EA, 0, + 0x01F1FB, 0x01F1EC, 0, + 0x01F1FB, 0x01F1EE, 0, + 0x01F1FB, 0x01F1F3, 0, + 0x01F1FB, 0x01F1FA, 0, + 0x01F1FC, 0x01F1EB, 0, + 0x01F1FC, 0x01F1F8, 0, + 0x01F1FD, 0x01F1F0, 0, + 0x01F1FE, 0x01F1EA, 0, + 0x01F1FE, 0x01F1F9, 0, + 0x01F1FF, 0x01F1E6, 0, + 0x01F1FF, 0x01F1F2, 0, + 0x01F1F0, 0x01F1FE, 0, + 0 // null-terminating 
the list +}; + +const base::uc32 UnicodePropertySequences::kEmojiTagSequences[] = { + 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0065, 0x0E006E, 0x0E0067, 0x0E007F, 0, + 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0073, 0x0E0063, 0x0E0074, 0x0E007F, 0, + 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0077, 0x0E006C, 0x0E0073, 0x0E007F, 0, + 0 // null-terminating the list +}; + +const base::uc32 UnicodePropertySequences::kEmojiZWJSequences[] = { + 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0, + 0x01F441, 0x00FE0F, 0x00200D, 0x01F5E8, 0x00FE0F, 0, + 0x01F468, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F467, 0, + 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, + 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0, + 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F468, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, + 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0, + 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, + 0x01F468, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F468, 0x01F3FF, 0x00200D, 
0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, + 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0, + 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F469, 0, + 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D, + 0x01F468, 0, + 0x01F469, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D, + 0x01F469, 0, + 0x01F469, 0x00200D, 0x01F466, 0, + 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, + 0x01F469, 0x00200D, 0x01F467, 0, + 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, + 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, + 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0, + 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F466, 0x00200D, 0x01F466, 0, + 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0, + 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F466, 0, + 0x01F469, 0x00200D, 0x01F469, 0x00200D, 0x01F467, 0x00200D, 0x01F467, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, + 0x01F469, 0x01F3FD, 0x00200D, 
0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FF, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FD, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FB, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FC, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FD, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F468, 0x01F3FE, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FB, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FC, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FD, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F469, 0x01F3FE, 0, + 0x01F9D1, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0, + 0x01F9D1, 0x01F3FB, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, + 0x01F9D1, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, + 0x01F9D1, 0x01F3FC, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, + 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, + 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, + 0x01F9D1, 0x01F3FD, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0, + 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, + 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, + 0x01F9D1, 0x01F3FE, 
0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0, + 0x01F9D1, 0x01F3FE, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FE, 0, + 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FB, 0, + 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FC, 0, + 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FD, 0, + 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FE, 0, + 0x01F9D1, 0x01F3FF, 0x00200D, 0x01F91D, 0x00200D, 0x01F9D1, 0x01F3FF, 0, + 0x01F468, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F468, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F468, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F468, 0x00200D, 0x01F33E, 0, + 0x01F468, 0x00200D, 0x01F373, 0, + 0x01F468, 0x00200D, 0x01F393, 0, + 0x01F468, 0x00200D, 0x01F3A4, 0, + 0x01F468, 0x00200D, 0x01F3A8, 0, + 0x01F468, 0x00200D, 0x01F3EB, 0, + 0x01F468, 0x00200D, 0x01F3ED, 0, + 0x01F468, 0x00200D, 0x01F4BB, 0, + 0x01F468, 0x00200D, 0x01F4BC, 0, + 0x01F468, 0x00200D, 0x01F527, 0, + 0x01F468, 0x00200D, 0x01F52C, 0, + 0x01F468, 0x00200D, 0x01F680, 0, + 0x01F468, 0x00200D, 0x01F692, 0, + 0x01F468, 0x00200D, 0x01F9AF, 0, + 0x01F468, 0x00200D, 0x01F9BC, 0, + 0x01F468, 0x00200D, 0x01F9BD, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F33E, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F373, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F393, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F3A4, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F3A8, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F3EB, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F3ED, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F4BB, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F4BC, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F527, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F52C, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F680, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F692, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F9AF, 0, + 
0x01F468, 0x01F3FB, 0x00200D, 0x01F9BC, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F9BD, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F33E, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F373, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F393, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F3A4, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F3A8, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F3EB, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F3ED, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F4BB, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F4BC, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F527, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F52C, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F680, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F692, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9AF, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9BC, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9BD, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F33E, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F373, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F393, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F3A4, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F3A8, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F3EB, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F3ED, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F4BB, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F4BC, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F527, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F52C, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F680, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F692, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F9AF, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F9BC, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F9BD, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F468, 0x01F3FE, 0x00200D, 
0x002708, 0x00FE0F, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F33E, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F373, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F393, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F3A4, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F3A8, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F3EB, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F3ED, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F4BB, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F4BC, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F527, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F52C, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F680, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F692, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9AF, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9BC, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9BD, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F33E, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F373, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F393, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F3A4, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F3A8, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F3EB, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F3ED, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F4BB, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F4BC, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F527, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F52C, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F680, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F692, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9AF, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9BC, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9BD, 0, + 0x01F469, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F469, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F469, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F469, 0x00200D, 0x01F33E, 0, + 0x01F469, 0x00200D, 0x01F373, 0, + 0x01F469, 0x00200D, 0x01F393, 0, + 0x01F469, 0x00200D, 0x01F3A4, 0, + 0x01F469, 0x00200D, 0x01F3A8, 0, + 0x01F469, 0x00200D, 0x01F3EB, 0, + 0x01F469, 0x00200D, 
0x01F3ED, 0, + 0x01F469, 0x00200D, 0x01F4BB, 0, + 0x01F469, 0x00200D, 0x01F4BC, 0, + 0x01F469, 0x00200D, 0x01F527, 0, + 0x01F469, 0x00200D, 0x01F52C, 0, + 0x01F469, 0x00200D, 0x01F680, 0, + 0x01F469, 0x00200D, 0x01F692, 0, + 0x01F469, 0x00200D, 0x01F9AF, 0, + 0x01F469, 0x00200D, 0x01F9BC, 0, + 0x01F469, 0x00200D, 0x01F9BD, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F33E, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F373, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F393, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F3A4, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F3A8, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F3EB, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F3ED, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F4BB, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F4BC, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F527, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F52C, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F680, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F692, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9AF, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9BC, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9BD, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F33E, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F373, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F393, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F3A4, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F3A8, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F3EB, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F3ED, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F4BB, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F4BC, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F527, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F52C, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F680, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F692, 0, + 0x01F469, 0x01F3FC, 0x00200D, 
0x01F9AF, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F9BC, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F9BD, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F33E, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F373, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F393, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F3A4, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F3A8, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F3EB, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F3ED, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F4BB, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F4BC, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F527, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F52C, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F680, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F692, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9AF, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9BC, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9BD, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F33E, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F373, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F393, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F3A4, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F3A8, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F3EB, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F3ED, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F4BB, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F4BC, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F527, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F52C, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F680, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F692, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9AF, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9BC, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9BD, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x002695, 0x00FE0F, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x002696, 0x00FE0F, 0, + 0x01F469, 
0x01F3FF, 0x00200D, 0x002708, 0x00FE0F, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F33E, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F373, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F393, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F3A4, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F3A8, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F3EB, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F3ED, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F4BB, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F4BC, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F527, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F52C, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F680, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F692, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9AF, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9BC, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9BD, 0, + 0x0026F9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x0026F9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x0026F9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x0026F9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x0026F9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x0026F9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x0026F9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x0026F9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x0026F9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x0026F9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x0026F9, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x0026F9, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C3, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C3, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C3, 0x01F3FF, 0x00200D, 0x002640, 
0x00FE0F, 0, + 0x01F3C3, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C4, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C4, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3C4, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CA, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CA, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CA, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CB, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CB, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CB, 0x00FE0F, 0x00200D, 0x002642, 
0x00FE0F, 0, + 0x01F3CC, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CC, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F3CC, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F3CC, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46E, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46E, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F46F, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F46F, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F471, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F471, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F471, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F471, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F471, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F471, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F471, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F471, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F471, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F471, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F471, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F471, 
0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F473, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F473, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F473, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F473, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F473, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F473, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F473, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F473, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F473, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F473, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F473, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F473, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F477, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F477, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F477, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F477, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F477, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F477, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F477, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F477, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F477, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F477, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F477, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F477, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F481, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F481, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F481, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F481, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F481, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F481, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F481, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F481, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F481, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F481, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F481, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F481, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F482, 0x00200D, 0x002640, 
0x00FE0F, 0, + 0x01F482, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F48B, 0x00200D, + 0x01F468, 0, + 0x01F482, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F482, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F482, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F482, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F482, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F482, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F482, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F482, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F482, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F486, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F486, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F486, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F486, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F486, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F486, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F486, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F486, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F486, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F486, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F486, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F486, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F487, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F487, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F487, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F487, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F487, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F487, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F487, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F487, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F487, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F487, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F487, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F487, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F575, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F575, 0x01F3FB, 
0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F575, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F575, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F575, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F575, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F575, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F575, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F575, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F575, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F575, 0x00FE0F, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F575, 0x00FE0F, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F645, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F645, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F645, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F645, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F645, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F645, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F645, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F645, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F645, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F645, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F645, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F645, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F646, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F646, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F646, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F646, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F646, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F646, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F646, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F646, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F646, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F646, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F646, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F646, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F647, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F647, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F647, 0x01F3FB, 0x00200D, 0x002640, 
0x00FE0F, 0, + 0x01F647, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F647, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F647, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F647, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F647, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F647, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F647, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F647, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F647, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64B, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64B, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64B, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64B, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64B, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64B, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64B, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64B, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64B, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64B, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64B, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64B, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64D, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64D, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64D, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64D, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64D, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64D, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64D, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64D, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64D, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64D, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64D, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64D, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64E, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64E, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64E, 
0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F64E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F64E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6A3, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6A3, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6A3, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B4, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B4, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B4, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B5, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B5, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FC, 0x00200D, 
0x002642, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B5, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B6, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B6, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F6B6, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F926, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F926, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F926, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F926, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F926, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F926, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F926, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F926, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F926, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F926, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F926, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F926, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F937, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F937, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F937, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F937, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F937, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F937, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F937, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 
0x01F937, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F937, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F937, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F937, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F937, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F938, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F938, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F938, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F938, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F938, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F938, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F938, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F938, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F938, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F938, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F938, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F938, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F939, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F939, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F939, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F939, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F939, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F939, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F939, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F939, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F939, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F939, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F939, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F939, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93C, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93C, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93D, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93D, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93D, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93D, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93D, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93D, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93D, 0x01F3FD, 0x00200D, 0x002640, 
0x00FE0F, 0, + 0x01F93D, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93D, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93D, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93D, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93D, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93E, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93E, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93E, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93E, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93E, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93E, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93E, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93E, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93E, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93E, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F93E, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F93E, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B8, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B8, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B8, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B9, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B9, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B9, 
0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9B9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CD, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CE, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FE, 0x00200D, 
0x002642, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9CF, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D6, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D6, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D6, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D7, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D7, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D7, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D8, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D8, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D8, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 
0x01F9D8, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D9, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D9, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9D9, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DA, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DA, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DA, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DB, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DC, 0x00200D, 
0x002640, 0x00FE0F, 0, + 0x01F9DC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DC, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FB, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FC, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FC, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FD, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FD, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DD, 0x01F3FF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DE, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DE, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F9DF, 0x00200D, 0x002640, 0x00FE0F, 0, + 0x01F9DF, 0x00200D, 0x002642, 0x00FE0F, 0, + 0x01F468, 0x00200D, 0x01F9B0, 0, + 0x01F468, 0x00200D, 0x01F9B1, 0, + 0x01F468, 0x00200D, 0x01F9B2, 0, + 0x01F468, 0x00200D, 0x01F9B3, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B0, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B1, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B2, 0, + 0x01F468, 0x01F3FB, 0x00200D, 0x01F9B3, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B0, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B1, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B2, 0, + 0x01F468, 0x01F3FC, 0x00200D, 0x01F9B3, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B0, 0, + 0x01F468, 
0x01F3FD, 0x00200D, 0x01F9B1, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B2, 0, + 0x01F468, 0x01F3FD, 0x00200D, 0x01F9B3, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B0, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B1, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B2, 0, + 0x01F468, 0x01F3FE, 0x00200D, 0x01F9B3, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B0, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B1, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B2, 0, + 0x01F468, 0x01F3FF, 0x00200D, 0x01F9B3, 0, + 0x01F469, 0x00200D, 0x01F9B0, 0, + 0x01F469, 0x00200D, 0x01F9B1, 0, + 0x01F469, 0x00200D, 0x01F9B2, 0, + 0x01F469, 0x00200D, 0x01F9B3, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B0, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B1, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B2, 0, + 0x01F469, 0x01F3FB, 0x00200D, 0x01F9B3, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B0, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B1, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B2, 0, + 0x01F469, 0x01F3FC, 0x00200D, 0x01F9B3, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B0, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B1, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B2, 0, + 0x01F469, 0x01F3FD, 0x00200D, 0x01F9B3, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B0, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B1, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B2, 0, + 0x01F469, 0x01F3FE, 0x00200D, 0x01F9B3, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B0, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B1, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B2, 0, + 0x01F469, 0x01F3FF, 0x00200D, 0x01F9B3, 0, + 0x01F3F3, 0x00FE0F, 0x00200D, 0x01F308, 0, + 0x01F3F4, 0x00200D, 0x002620, 0x00FE0F, 0, + 0x01F415, 0x00200D, 0x01F9BA, 0, + 0x01F482, 0x01F3FB, 0x00200D, 0x002640, 0x00FE0F, 0, + 0 // null-terminating the list +}; +// clang-format on + +} // namespace internal +} // namespace v8 + +#endif // V8_INTL_SUPPORT diff --git a/js/src/irregexp/imported/property-sequences.h b/js/src/irregexp/imported/property-sequences.h new file mode 100644 index 0000000000..9b3a188865 --- 
/dev/null +++ b/js/src/irregexp/imported/property-sequences.h @@ -0,0 +1,27 @@ +// Copyright 2018 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_PROPERTY_SEQUENCES_H_ +#define V8_REGEXP_PROPERTY_SEQUENCES_H_ + +#ifdef V8_INTL_SUPPORT + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class UnicodePropertySequences : public AllStatic { + public: + static const base::uc32 kEmojiFlagSequences[]; + static const base::uc32 kEmojiTagSequences[]; + static const base::uc32 kEmojiZWJSequences[]; +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_INTL_SUPPORT + +#endif // V8_REGEXP_PROPERTY_SEQUENCES_H_ diff --git a/js/src/irregexp/imported/regexp-ast.cc b/js/src/irregexp/imported/regexp-ast.cc new file mode 100644 index 0000000000..63eeb5c05d --- /dev/null +++ b/js/src/irregexp/imported/regexp-ast.cc @@ -0,0 +1,432 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "irregexp/imported/regexp-ast.h" + + +namespace v8 { +namespace internal { + +#define MAKE_ACCEPT(Name) \ + void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \ + return visitor->Visit##Name(this, data); \ + } +FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT) +#undef MAKE_ACCEPT + +#define MAKE_TYPE_CASE(Name) \ + RegExp##Name* RegExpTree::As##Name() { return nullptr; } \ + bool RegExpTree::Is##Name() { return false; } +FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) +#undef MAKE_TYPE_CASE + +#define MAKE_TYPE_CASE(Name) \ + RegExp##Name* RegExp##Name::As##Name() { return this; } \ + bool RegExp##Name::Is##Name() { return true; } +FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) +#undef MAKE_TYPE_CASE + +namespace { + +Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) { + Interval result = Interval::Empty(); + for (int i = 0; i < children->length(); i++) + result = result.Union(children->at(i)->CaptureRegisters()); + return result; +} + +} // namespace + +Interval RegExpAlternative::CaptureRegisters() { + return ListCaptureRegisters(nodes()); +} + + +Interval RegExpDisjunction::CaptureRegisters() { + return ListCaptureRegisters(alternatives()); +} + + +Interval RegExpLookaround::CaptureRegisters() { + return body()->CaptureRegisters(); +} + + +Interval RegExpCapture::CaptureRegisters() { + Interval self(StartRegister(index()), EndRegister(index())); + return self.Union(body()->CaptureRegisters()); +} + + +Interval RegExpQuantifier::CaptureRegisters() { + return body()->CaptureRegisters(); +} + + +bool RegExpAssertion::IsAnchoredAtStart() { + return assertion_type() == RegExpAssertion::Type::START_OF_INPUT; +} + + +bool RegExpAssertion::IsAnchoredAtEnd() { + return assertion_type() == RegExpAssertion::Type::END_OF_INPUT; +} + + +bool RegExpAlternative::IsAnchoredAtStart() { + ZoneList<RegExpTree*>* nodes = this->nodes(); + for (int i = 0; i < nodes->length(); i++) { + RegExpTree* node = nodes->at(i); + if (node->IsAnchoredAtStart()) { + return 
true; + } + if (node->max_match() > 0) { + return false; + } + } + return false; +} + + +bool RegExpAlternative::IsAnchoredAtEnd() { + ZoneList<RegExpTree*>* nodes = this->nodes(); + for (int i = nodes->length() - 1; i >= 0; i--) { + RegExpTree* node = nodes->at(i); + if (node->IsAnchoredAtEnd()) { + return true; + } + if (node->max_match() > 0) { + return false; + } + } + return false; +} + + +bool RegExpDisjunction::IsAnchoredAtStart() { + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + for (int i = 0; i < alternatives->length(); i++) { + if (!alternatives->at(i)->IsAnchoredAtStart()) return false; + } + return true; +} + + +bool RegExpDisjunction::IsAnchoredAtEnd() { + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + for (int i = 0; i < alternatives->length(); i++) { + if (!alternatives->at(i)->IsAnchoredAtEnd()) return false; + } + return true; +} + + +bool RegExpLookaround::IsAnchoredAtStart() { + return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart(); +} + + +bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); } + + +bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); } + +namespace { + +// Convert regular expression trees to a simple sexp representation. +// This representation should be different from the input grammar +// in as many cases as possible, to make it more difficult for incorrect +// parses to look as correct ones which is likely if the input and +// output formats are alike. 
+class RegExpUnparser final : public RegExpVisitor { + public: + RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {} + void VisitCharacterRange(CharacterRange that); +#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override; + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) +#undef MAKE_CASE + private: + std::ostream& os_; + Zone* zone_; +}; + +} // namespace + +void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) { + os_ << "(|"; + for (int i = 0; i < that->alternatives()->length(); i++) { + os_ << " "; + that->alternatives()->at(i)->Accept(this, data); + } + os_ << ")"; + return nullptr; +} + + +void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) { + os_ << "(:"; + for (int i = 0; i < that->nodes()->length(); i++) { + os_ << " "; + that->nodes()->at(i)->Accept(this, data); + } + os_ << ")"; + return nullptr; +} + + +void RegExpUnparser::VisitCharacterRange(CharacterRange that) { + os_ << AsUC32(that.from()); + if (!that.IsSingleton()) { + os_ << "-" << AsUC32(that.to()); + } +} + +void* RegExpUnparser::VisitClassRanges(RegExpClassRanges* that, void* data) { + if (that->is_negated()) os_ << "^"; + os_ << "["; + for (int i = 0; i < that->ranges(zone_)->length(); i++) { + if (i > 0) os_ << " "; + VisitCharacterRange(that->ranges(zone_)->at(i)); + } + os_ << "]"; + return nullptr; +} + +void* RegExpUnparser::VisitClassSetOperand(RegExpClassSetOperand* that, + void* data) { + os_ << "!["; + for (int i = 0; i < that->ranges()->length(); i++) { + if (i > 0) os_ << " "; + VisitCharacterRange(that->ranges()->at(i)); + } + if (that->has_strings()) { + for (auto iter : *that->strings()) { + os_ << " '"; + os_ << std::string(iter.first.begin(), iter.first.end()); + os_ << "'"; + } + } + os_ << "]"; + return nullptr; +} + +void* RegExpUnparser::VisitClassSetExpression(RegExpClassSetExpression* that, + void* data) { + switch (that->operation()) { + case RegExpClassSetExpression::OperationType::kUnion: 
+ os_ << "++"; + break; + case RegExpClassSetExpression::OperationType::kIntersection: + os_ << "&&"; + break; + case RegExpClassSetExpression::OperationType::kSubtraction: + os_ << "--"; + break; + } + if (that->is_negated()) os_ << "^"; + os_ << "["; + for (int i = 0; i < that->operands()->length(); i++) { + if (i > 0) os_ << " "; + that->operands()->at(i)->Accept(this, data); + } + os_ << "]"; + return nullptr; +} + +void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) { + switch (that->assertion_type()) { + case RegExpAssertion::Type::START_OF_INPUT: + os_ << "@^i"; + break; + case RegExpAssertion::Type::END_OF_INPUT: + os_ << "@$i"; + break; + case RegExpAssertion::Type::START_OF_LINE: + os_ << "@^l"; + break; + case RegExpAssertion::Type::END_OF_LINE: + os_ << "@$l"; + break; + case RegExpAssertion::Type::BOUNDARY: + os_ << "@b"; + break; + case RegExpAssertion::Type::NON_BOUNDARY: + os_ << "@B"; + break; + } + return nullptr; +} + + +void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) { + os_ << "'"; + base::Vector<const base::uc16> chardata = that->data(); + for (int i = 0; i < chardata.length(); i++) { + os_ << AsUC16(chardata[i]); + } + os_ << "'"; + return nullptr; +} + + +void* RegExpUnparser::VisitText(RegExpText* that, void* data) { + if (that->elements()->length() == 1) { + that->elements()->at(0).tree()->Accept(this, data); + } else { + os_ << "(!"; + for (int i = 0; i < that->elements()->length(); i++) { + os_ << " "; + that->elements()->at(i).tree()->Accept(this, data); + } + os_ << ")"; + } + return nullptr; +} + + +void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) { + os_ << "(# " << that->min() << " "; + if (that->max() == RegExpTree::kInfinity) { + os_ << "- "; + } else { + os_ << that->max() << " "; + } + os_ << (that->is_greedy() ? "g " : that->is_possessive() ? 
"p " : "n "); + that->body()->Accept(this, data); + os_ << ")"; + return nullptr; +} + + +void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) { + os_ << "(^ "; + that->body()->Accept(this, data); + os_ << ")"; + return nullptr; +} + +void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) { + os_ << "(?: "; + that->body()->Accept(this, data); + os_ << ")"; + return nullptr; +} + +void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) { + os_ << "("; + os_ << (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-"); + os_ << (that->is_positive() ? " + " : " - "); + that->body()->Accept(this, data); + os_ << ")"; + return nullptr; +} + + +void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, + void* data) { + os_ << "(<- " << that->index() << ")"; + return nullptr; +} + + +void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) { + os_ << '%'; + return nullptr; +} + +std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { + RegExpUnparser unparser(os, zone); + Accept(&unparser, nullptr); + return os; +} + +RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives) + : alternatives_(alternatives) { + DCHECK_LT(1, alternatives->length()); + RegExpTree* first_alternative = alternatives->at(0); + min_match_ = first_alternative->min_match(); + max_match_ = first_alternative->max_match(); + for (int i = 1; i < alternatives->length(); i++) { + RegExpTree* alternative = alternatives->at(i); + min_match_ = std::min(min_match_, alternative->min_match()); + max_match_ = std::max(max_match_, alternative->max_match()); + } +} + +namespace { + +int IncreaseBy(int previous, int increase) { + if (RegExpTree::kInfinity - previous < increase) { + return RegExpTree::kInfinity; + } else { + return previous + increase; + } +} + +} // namespace + +RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes) + : nodes_(nodes) { + DCHECK_LT(1, nodes->length()); + min_match_ = 0; + max_match_ = 
0; + for (int i = 0; i < nodes->length(); i++) { + RegExpTree* node = nodes->at(i); + int node_min_match = node->min_match(); + min_match_ = IncreaseBy(min_match_, node_min_match); + int node_max_match = node->max_match(); + max_match_ = IncreaseBy(max_match_, node_max_match); + } +} + +RegExpClassSetOperand::RegExpClassSetOperand(ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings) + : ranges_(ranges), strings_(strings) { + DCHECK_NOT_NULL(ranges); + min_match_ = 0; + max_match_ = 0; + if (!ranges->is_empty()) { + min_match_ = 1; + max_match_ = 2; + } + if (has_strings()) { + for (auto string : *strings) { + min_match_ = std::min(min_match_, string.second->min_match()); + max_match_ = std::max(max_match_, string.second->max_match()); + } + } +} + +RegExpClassSetExpression::RegExpClassSetExpression( + OperationType op, bool is_negated, bool may_contain_strings, + ZoneList<RegExpTree*>* operands) + : operation_(op), + is_negated_(is_negated), + may_contain_strings_(may_contain_strings), + operands_(operands) { + DCHECK_NOT_NULL(operands); + DCHECK_IMPLIES(is_negated_, !may_contain_strings_); + max_match_ = 0; + for (auto op : *operands) { + max_match_ = std::max(max_match_, op->max_match()); + } +} + +// static +RegExpClassSetExpression* RegExpClassSetExpression::Empty(Zone* zone, + bool is_negated) { + ZoneList<CharacterRange>* ranges = + zone->template New<ZoneList<CharacterRange>>(0, zone); + RegExpClassSetOperand* op = + zone->template New<RegExpClassSetOperand>(ranges, nullptr); + ZoneList<RegExpTree*>* operands = + zone->template New<ZoneList<RegExpTree*>>(1, zone); + operands->Add(op, zone); + return zone->template New<RegExpClassSetExpression>( + RegExpClassSetExpression::OperationType::kUnion, is_negated, false, + operands); +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-ast.h b/js/src/irregexp/imported/regexp-ast.h new file mode 100644 index 0000000000..997282e519 --- /dev/null +++ 
b/js/src/irregexp/imported/regexp-ast.h @@ -0,0 +1,735 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_AST_H_ +#define V8_REGEXP_REGEXP_AST_H_ + +#ifdef V8_INTL_SUPPORT +#include "unicode/uniset.h" +#endif // V8_INTL_SUPPORT +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ + VISIT(Disjunction) \ + VISIT(Alternative) \ + VISIT(Assertion) \ + VISIT(ClassRanges) \ + VISIT(ClassSetOperand) \ + VISIT(ClassSetExpression) \ + VISIT(Atom) \ + VISIT(Quantifier) \ + VISIT(Capture) \ + VISIT(Group) \ + VISIT(Lookaround) \ + VISIT(BackReference) \ + VISIT(Empty) \ + VISIT(Text) + +#define FORWARD_DECLARE(Name) class RegExp##Name; +FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) +#undef FORWARD_DECLARE + +class RegExpCompiler; +class RegExpNode; +class RegExpTree; + +class RegExpVisitor { + public: + virtual ~RegExpVisitor() = default; +#define MAKE_CASE(Name) \ + virtual void* Visit##Name(RegExp##Name*, void* data) = 0; + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) +#undef MAKE_CASE +}; + +// A simple closed interval. +class Interval { + public: + Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). 
+ Interval(int from, int to) : from_(from), to_(to) {} + Interval Union(Interval that) { + if (that.from_ == kNone) return *this; + if (from_ == kNone) return that; + return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); + } + + static Interval Empty() { return Interval(); } + + bool Contains(int value) const { return (from_ <= value) && (value <= to_); } + bool is_empty() const { return from_ == kNone; } + int from() const { return from_; } + int to() const { return to_; } + int size() const { return to_ - from_ + 1; } + + static constexpr int kNone = -1; + + private: + int from_; + int to_; +}; + +// Named standard character sets. +enum class StandardCharacterSet : char { + kWhitespace = 's', // Like /\s/. + kNotWhitespace = 'S', // Like /\S/. + kWord = 'w', // Like /\w/. + kNotWord = 'W', // Like /\W/. + kDigit = 'd', // Like /\d/. + kNotDigit = 'D', // Like /\D/. + kLineTerminator = 'n', // The inverse of /./. + kNotLineTerminator = '.', // Like /./. + kEverything = '*', // Matches every character, like /./s. +}; + +// Represents code points (with values up to 0x10FFFF) in the range from from_ +// to to_, both ends are inclusive. +class CharacterRange { + public: + CharacterRange() = default; + // For compatibility with the CHECK_OK macro. 
+ CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT + + static inline CharacterRange Singleton(base::uc32 value) { + return CharacterRange(value, value); + } + static inline CharacterRange Range(base::uc32 from, base::uc32 to) { + DCHECK(0 <= from && to <= kMaxCodePoint); + DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); + return CharacterRange(from, to); + } + static inline CharacterRange Everything() { + return CharacterRange(0, kMaxCodePoint); + } + + static inline ZoneList<CharacterRange>* List(Zone* zone, + CharacterRange range) { + ZoneList<CharacterRange>* list = + zone->New<ZoneList<CharacterRange>>(1, zone); + list->Add(range, zone); + return list; + } + + // Add class escapes. Add case equivalent closure for \w and \W if necessary. + V8_EXPORT_PRIVATE static void AddClassEscape( + StandardCharacterSet standard_character_set, + ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents, + Zone* zone); + // Add case equivalents to ranges. Only used for /i, not for /ui or /vi, as + // the semantics for unicode mode are slightly different. + // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4. + V8_EXPORT_PRIVATE static void AddCaseEquivalents( + Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, + bool is_one_byte); + // Add case equivalent code points to ranges. Only used for /ui and /vi, not + // for /i, as the semantics for non-unicode mode are slightly different. + // See https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch Note 4. + static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, + Zone* zone); + +#ifdef V8_INTL_SUPPORT + // Creates the closeOver of the given UnicodeSet, removing all + // characters/strings that can't be derived via simple case folding. 
+ static void UnicodeSimpleCloseOver(icu::UnicodeSet& set); +#endif // V8_INTL_SUPPORT + + bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } + base::uc32 from() const { return from_; } + base::uc32 to() const { return to_; } + bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; } + bool IsSingleton() const { return from_ == to_; } + + // Whether a range list is in canonical form: Ranges ordered by from value, + // and ranges non-overlapping and non-adjacent. + V8_EXPORT_PRIVATE static bool IsCanonical( + const ZoneList<CharacterRange>* ranges); + // Convert range list to canonical form. The characters covered by the ranges + // will still be the same, but no character is in more than one range, and + // adjacent ranges are merged. The resulting list may be shorter than the + // original, but cannot be longer. + static void Canonicalize(ZoneList<CharacterRange>* ranges); + // Negate the contents of a character range in canonical form. + static void Negate(const ZoneList<CharacterRange>* src, + ZoneList<CharacterRange>* dst, Zone* zone); + // Intersect the contents of two character ranges in canonical form. + static void Intersect(const ZoneList<CharacterRange>* lhs, + const ZoneList<CharacterRange>* rhs, + ZoneList<CharacterRange>* dst, Zone* zone); + // Subtract the contents of |to_remove| from the contents of |src|. + static void Subtract(const ZoneList<CharacterRange>* src, + const ZoneList<CharacterRange>* to_remove, + ZoneList<CharacterRange>* dst, Zone* zone); + // Remove all ranges outside the one-byte range. + static void ClampToOneByte(ZoneList<CharacterRange>* ranges); + // Checks if two ranges (both need to be canonical) are equal. 
+ static bool Equals(const ZoneList<CharacterRange>* lhs, + const ZoneList<CharacterRange>* rhs); + + private: + CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {} + + static constexpr int kMaxCodePoint = 0x10ffff; + + base::uc32 from_ = 0; + base::uc32 to_ = 0; +}; + +inline bool operator==(const CharacterRange& lhs, const CharacterRange& rhs) { + return lhs.from() == rhs.from() && lhs.to() == rhs.to(); +} +inline bool operator!=(const CharacterRange& lhs, const CharacterRange& rhs) { + return !operator==(lhs, rhs); +} + +#define DECL_BOILERPLATE(Name) \ + void* Accept(RegExpVisitor* visitor, void* data) override; \ + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \ + override; \ + RegExp##Name* As##Name() override; \ + bool Is##Name() override + +class RegExpTree : public ZoneObject { + public: + static const int kInfinity = kMaxInt; + virtual ~RegExpTree() = default; + virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) = 0; + virtual bool IsTextElement() { return false; } + virtual bool IsAnchoredAtStart() { return false; } + virtual bool IsAnchoredAtEnd() { return false; } + virtual int min_match() = 0; + virtual int max_match() = 0; + // Returns the interval of registers used for captures within this + // expression. 
+ virtual Interval CaptureRegisters() { return Interval::Empty(); } + virtual void AppendToText(RegExpText* text, Zone* zone); + V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); +#define MAKE_ASTYPE(Name) \ + virtual RegExp##Name* As##Name(); \ + virtual bool Is##Name(); + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) +#undef MAKE_ASTYPE +}; + + +class RegExpDisjunction final : public RegExpTree { + public: + explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); + + DECL_BOILERPLATE(Disjunction); + + Interval CaptureRegisters() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList<RegExpTree*>* alternatives() const { return alternatives_; } + + private: + bool SortConsecutiveAtoms(RegExpCompiler* compiler); + void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); + void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); + ZoneList<RegExpTree*>* alternatives_; + int min_match_; + int max_match_; +}; + + +class RegExpAlternative final : public RegExpTree { + public: + explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); + + DECL_BOILERPLATE(Alternative); + + Interval CaptureRegisters() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList<RegExpTree*>* nodes() const { return nodes_; } + + private: + ZoneList<RegExpTree*>* nodes_; + int min_match_; + int max_match_; +}; + + +class RegExpAssertion final : public RegExpTree { + public: + enum class Type { + START_OF_LINE = 0, + START_OF_INPUT = 1, + END_OF_LINE = 2, + END_OF_INPUT = 3, + BOUNDARY = 4, + NON_BOUNDARY = 5, + LAST_ASSERTION_TYPE = NON_BOUNDARY, + }; + explicit RegExpAssertion(Type type) : assertion_type_(type) {} + + DECL_BOILERPLATE(Assertion); + + bool IsAnchoredAtStart() override; + bool 
IsAnchoredAtEnd() override; + int min_match() override { return 0; } + int max_match() override { return 0; } + Type assertion_type() const { return assertion_type_; } + + private: + const Type assertion_type_; +}; + +class CharacterSet final { + public: + explicit CharacterSet(StandardCharacterSet standard_set_type) + : standard_set_type_(standard_set_type) {} + explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {} + + ZoneList<CharacterRange>* ranges(Zone* zone); + StandardCharacterSet standard_set_type() const { + return standard_set_type_.value(); + } + void set_standard_set_type(StandardCharacterSet standard_set_type) { + standard_set_type_ = standard_set_type; + } + bool is_standard() const { return standard_set_type_.has_value(); } + V8_EXPORT_PRIVATE void Canonicalize(); + + private: + ZoneList<CharacterRange>* ranges_ = nullptr; + base::Optional<StandardCharacterSet> standard_set_type_; +}; + +class RegExpClassRanges final : public RegExpTree { + public: + // NEGATED: The character class is negated and should match everything but + // the specified ranges. + // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split + // surrogate and should not be unicode-desugared (crbug.com/641091). + enum Flag { + NEGATED = 1 << 0, + CONTAINS_SPLIT_SURROGATE = 1 << 1, + }; + using ClassRangesFlags = base::Flags<Flag>; + + RegExpClassRanges(Zone* zone, ZoneList<CharacterRange>* ranges, + ClassRangesFlags class_ranges_flags = ClassRangesFlags()) + : set_(ranges), class_ranges_flags_(class_ranges_flags) { + // Convert the empty set of ranges to the negated Everything() range. 
+ if (ranges->is_empty()) { + ranges->Add(CharacterRange::Everything(), zone); + class_ranges_flags_ ^= NEGATED; + } + } + explicit RegExpClassRanges(StandardCharacterSet standard_set_type) + : set_(standard_set_type), class_ranges_flags_() {} + + DECL_BOILERPLATE(ClassRanges); + + bool IsTextElement() override { return true; } + int min_match() override { return 1; } + // The character class may match two code units for unicode regexps. + // TODO(yangguo): we should split this class for usage in TextElement, and + // make max_match() dependent on the character class content. + int max_match() override { return 2; } + + void AppendToText(RegExpText* text, Zone* zone) override; + + // TODO(lrn): Remove need for complex version of is_standard that + // recognizes a mangled standard set and just do { return set_.is_standard(); } + bool is_standard(Zone* zone); + // Returns a value representing the standard character set if is_standard() + // returns true. + StandardCharacterSet standard_type() const { + return set_.standard_set_type(); + } + + CharacterSet character_set() const { return set_; } + ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } + + bool is_negated() const { return (class_ranges_flags_ & NEGATED) != 0; } + bool contains_split_surrogate() const { + return (class_ranges_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; + } + + private: + CharacterSet set_; + ClassRangesFlags class_ranges_flags_; +}; + +/* Ordering functor for class strings: a longer string sorts before a shorter one; strings of equal length are ordered by their first differing element; identical strings compare as not-less. */ struct CharacterClassStringLess { + bool operator()(const base::Vector<const base::uc32>& lhs, + const base::Vector<const base::uc32>& rhs) const { + // Longer strings first so we generate matches for the largest string + // possible. + if (lhs.length() != rhs.length()) { + return lhs.length() > rhs.length(); + } + for (int i = 0; i < lhs.length(); i++) { + if (lhs[i] != rhs[i]) { + return lhs[i] < rhs[i]; + } + } + return false; + } +}; + +// A type used for strings as part of character classes (only possible in +// unicode sets mode).
+// We use a ZoneMap instead of an UnorderedZoneMap because we need to match +// the longest alternatives first. By using a ZoneMap with the custom comparator +// we can avoid sorting before assembling the code. +// Strings are likely short (the largest string in current unicode properties +// consists of 10 code points). +using CharacterClassStrings = ZoneMap<base::Vector<const base::uc32>, + RegExpTree*, CharacterClassStringLess>; + +// TODO(pthier): If we are sure we don't want to use icu::UnicodeSets +// (performance evaluation pending), this class can be merged with +// RegExpClassRanges. +class RegExpClassSetOperand final : public RegExpTree { + public: + RegExpClassSetOperand(ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings); + + DECL_BOILERPLATE(ClassSetOperand); + + bool IsTextElement() override { return true; } + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + + void Union(RegExpClassSetOperand* other, Zone* zone); + void Intersect(RegExpClassSetOperand* other, + ZoneList<CharacterRange>* temp_ranges, Zone* zone); + void Subtract(RegExpClassSetOperand* other, + ZoneList<CharacterRange>* temp_ranges, Zone* zone); + + bool has_strings() const { return strings_ != nullptr && !strings_->empty(); } + ZoneList<CharacterRange>* ranges() { return ranges_; } + CharacterClassStrings* strings() { + DCHECK_NOT_NULL(strings_); + return strings_; + } + + private: + ZoneList<CharacterRange>* ranges_; + CharacterClassStrings* strings_; + int min_match_; + int max_match_; +}; + +class RegExpClassSetExpression final : public RegExpTree { + public: + enum class OperationType { kUnion, kIntersection, kSubtraction }; + + RegExpClassSetExpression(OperationType op, bool is_negated, + bool may_contain_strings, + ZoneList<RegExpTree*>* operands); + + DECL_BOILERPLATE(ClassSetExpression); + + // Create an empty class set expression (matches everything if |is_negated|, + // nothing otherwise). 
+ static RegExpClassSetExpression* Empty(Zone* zone, bool is_negated); + + bool IsTextElement() override { return true; } + int min_match() override { return 0; } + int max_match() override { return max_match_; } + + OperationType operation() const { return operation_; } + bool is_negated() const { return is_negated_; } + bool may_contain_strings() const { return may_contain_strings_; } + const ZoneList<RegExpTree*>* operands() const { return operands_; } + ZoneList<RegExpTree*>* operands() { return operands_; } + + private: + // Recursively evaluates the tree rooted at |root|, computing the valid + // CharacterRanges and strings after applying all set operations. + // The original tree will be modified by this method, so don't store pointers + // to inner nodes of the tree somewhere else! + // Modifying the tree in-place saves memory and speeds up multiple calls of + // the method (e.g. when unrolling quantifiers). + // |temp_ranges| is used for intermediate results, passed as parameter to + // avoid allocating new lists all the time. 
+ static RegExpClassSetOperand* ComputeExpression( + RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone); + + const OperationType operation_; + const bool is_negated_; + const bool may_contain_strings_; + ZoneList<RegExpTree*>* operands_ = nullptr; + int max_match_; +}; + +class RegExpAtom final : public RegExpTree { + public: + explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {} + + DECL_BOILERPLATE(Atom); + + bool IsTextElement() override { return true; } + int min_match() override { return data_.length(); } + int max_match() override { return data_.length(); } + void AppendToText(RegExpText* text, Zone* zone) override; + + base::Vector<const base::uc16> data() const { return data_; } + int length() const { return data_.length(); } + + private: + base::Vector<const base::uc16> data_; +}; + +class TextElement final { + public: + enum TextType { ATOM, CLASS_RANGES }; + + static TextElement Atom(RegExpAtom* atom); + static TextElement ClassRanges(RegExpClassRanges* class_ranges); + + int cp_offset() const { return cp_offset_; } + void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } + int length() const; + + TextType text_type() const { return text_type_; } + + RegExpTree* tree() const { return tree_; } + + RegExpAtom* atom() const { + DCHECK(text_type() == ATOM); + return reinterpret_cast<RegExpAtom*>(tree()); + } + + RegExpClassRanges* class_ranges() const { + DCHECK(text_type() == CLASS_RANGES); + return reinterpret_cast<RegExpClassRanges*>(tree()); + } + + private: + TextElement(TextType text_type, RegExpTree* tree) + : cp_offset_(-1), text_type_(text_type), tree_(tree) {} + + int cp_offset_; + TextType text_type_; + RegExpTree* tree_; +}; + +class RegExpText final : public RegExpTree { + public: + explicit RegExpText(Zone* zone) : elements_(2, zone) {} + + DECL_BOILERPLATE(Text); + + bool IsTextElement() override { return true; } + int min_match() override { return length_; } + int max_match() override { return 
length_; } + void AppendToText(RegExpText* text, Zone* zone) override; + void AddElement(TextElement elm, Zone* zone) { + elements_.Add(elm, zone); + length_ += elm.length(); + } + ZoneList<TextElement>* elements() { return &elements_; } + + private: + ZoneList<TextElement> elements_; + int length_ = 0; +}; + + +/* Quantifier node: stores the repetition bounds |min| and |max|, the quantifier type and the quantified |body|. The cached match bounds are the body's bounds multiplied by the corresponding repetition count, saturated to kInfinity. */ class RegExpQuantifier final : public RegExpTree { + public: + enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; + RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) + : body_(body), + min_(min), + max_(max), + quantifier_type_(type) { + /* Compare against kInfinity divided by the count so the multiplication below is only performed when the product cannot exceed kInfinity. */ if (min > 0 && body->min_match() > kInfinity / min) { + min_match_ = kInfinity; + } else { + min_match_ = min * body->min_match(); + } + if (max > 0 && body->max_match() > kInfinity / max) { + max_match_ = kInfinity; + } else { + max_match_ = max * body->max_match(); + } + } + + DECL_BOILERPLATE(Quantifier); + + static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, + RegExpCompiler* compiler, RegExpNode* on_success, + bool not_at_start = false); + Interval CaptureRegisters() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + int min() const { return min_; } + int max() const { return max_; } + QuantifierType quantifier_type() const { return quantifier_type_; } + bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } + bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } + bool is_greedy() const { return quantifier_type_ == GREEDY; } + RegExpTree* body() const { return body_; } + + private: + RegExpTree* body_; + int min_; + int max_; + int min_match_; + int max_match_; + QuantifierType quantifier_type_; +}; + + +class RegExpCapture final : public RegExpTree { + public: + explicit RegExpCapture(int index) + : body_(nullptr), + index_(index), + min_match_(0), + max_match_(0), + name_(nullptr) {} + + DECL_BOILERPLATE(Capture); + + static RegExpNode* ToNode(RegExpTree* body, int
index, + RegExpCompiler* compiler, RegExpNode* on_success); + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + Interval CaptureRegisters() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + RegExpTree* body() { return body_; } + /* Attaches |body| and copies its min and max match lengths into this node at that moment. */ void set_body(RegExpTree* body) { + body_ = body; + min_match_ = body->min_match(); + max_match_ = body->max_match(); + } + int index() const { return index_; } + const ZoneVector<base::uc16>* name() const { return name_; } + void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } + /* Register numbering: a capture index maps to two consecutive registers, twice the index for the start and the following register for the end. */ static int StartRegister(int index) { return index * 2; } + static int EndRegister(int index) { return index * 2 + 1; } + + private: + RegExpTree* body_ = nullptr; + int index_; + int min_match_ = 0; + int max_match_ = 0; + const ZoneVector<base::uc16>* name_ = nullptr; +}; + +/* Group node around a single |body|: the anchoring and capture-register queries forward to the body, and the match bounds are copied from the body at construction. */ class RegExpGroup final : public RegExpTree { + public: + explicit RegExpGroup(RegExpTree* body) + : body_(body), + min_match_(body->min_match()), + max_match_(body->max_match()) {} + + DECL_BOILERPLATE(Group); + + bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } + bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + Interval CaptureRegisters() override { return body_->CaptureRegisters(); } + RegExpTree* body() const { return body_; } + + private: + RegExpTree* body_; + int min_match_; + int max_match_; +}; + +class RegExpLookaround final : public RegExpTree { + public: + enum Type { LOOKAHEAD, LOOKBEHIND }; + + RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, + int capture_from, Type type) + : body_(body), + is_positive_(is_positive), + capture_count_(capture_count), + capture_from_(capture_from), + type_(type) {} + + DECL_BOILERPLATE(Lookaround); + + Interval CaptureRegisters() override; + bool IsAnchoredAtStart() override; + int
min_match() override { return 0; } + int max_match() override { return 0; } + RegExpTree* body() const { return body_; } + bool is_positive() const { return is_positive_; } + int capture_count() const { return capture_count_; } + int capture_from() const { return capture_from_; } + Type type() const { return type_; } + + class Builder { + public: + Builder(bool is_positive, RegExpNode* on_success, + int stack_pointer_register, int position_register, + int capture_register_count = 0, int capture_register_start = 0); + RegExpNode* on_match_success() const { return on_match_success_; } + RegExpNode* ForMatch(RegExpNode* match); + + private: + bool is_positive_; + RegExpNode* on_match_success_; + RegExpNode* on_success_; + int stack_pointer_register_; + int position_register_; + }; + + private: + RegExpTree* body_; + bool is_positive_; + int capture_count_; + int capture_from_; + Type type_; +}; + + +class RegExpBackReference final : public RegExpTree { + public: + explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {} + RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) + : capture_(capture), flags_(flags) {} + + DECL_BOILERPLATE(BackReference); + + int min_match() override { return 0; } + // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite + // recursion, we give up. Ignorance is bliss. 
+ int max_match() override { return kInfinity; } + int index() const { return capture_->index(); } + RegExpCapture* capture() const { return capture_; } + void set_capture(RegExpCapture* capture) { capture_ = capture; } + const ZoneVector<base::uc16>* name() const { return name_; } + void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } + + private: + RegExpCapture* capture_ = nullptr; + const ZoneVector<base::uc16>* name_ = nullptr; + const RegExpFlags flags_; +}; + + +class RegExpEmpty final : public RegExpTree { + public: + DECL_BOILERPLATE(Empty); + int min_match() override { return 0; } + int max_match() override { return 0; } +}; + +} // namespace internal +} // namespace v8 + +#undef DECL_BOILERPLATE + +#endif // V8_REGEXP_REGEXP_AST_H_ diff --git a/js/src/irregexp/imported/regexp-bytecode-generator-inl.h b/js/src/irregexp/imported/regexp-bytecode-generator-inl.h new file mode 100644 index 0000000000..807ca66f47 --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecode-generator-inl.h @@ -0,0 +1,55 @@ +// Copyright 2008-2009 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_ +#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_ + +#include "irregexp/imported/regexp-bytecode-generator.h" + +#include "irregexp/imported/regexp-bytecodes.h" + +namespace v8 { +namespace internal { + +/* Emits one 32-bit instruction word: |byte| is OR-ed into the low bits and the unsigned operand |twenty_four_bits| is shifted left by BYTECODE_SHIFT. The operand is DCHECKed to fit in 24 unsigned bits. */ void RegExpBytecodeGenerator::Emit(uint32_t byte, uint32_t twenty_four_bits) { + DCHECK(is_uint24(twenty_four_bits)); + Emit32((twenty_four_bits << BYTECODE_SHIFT) | byte); +} + +/* Signed-operand overload: the operand is DCHECKed to fit in 24 signed bits and is converted to uint32_t before being shifted into place. */ void RegExpBytecodeGenerator::Emit(uint32_t byte, int32_t twenty_four_bits) { + DCHECK(is_int24(twenty_four_bits)); + Emit32((static_cast<uint32_t>(twenty_four_bits) << BYTECODE_SHIFT) | byte); +} + +/* Stores |word| through a uint16_t pointer at offset pc_ of the buffer (so only its low 16 bits are kept) and advances pc_ by 2. ExpandBuffer() is called first when fewer than 2 bytes remain. */ void RegExpBytecodeGenerator::Emit16(uint32_t word) { + DCHECK(pc_ <= static_cast<int>(buffer_.size())); + if (pc_ + 1 >= static_cast<int>(buffer_.size())) { + ExpandBuffer(); + } + *reinterpret_cast<uint16_t*>(buffer_.data() + pc_) = word; + pc_ += 2; +} + +/* Stores |word| as a single unsigned char at offset pc_ and advances pc_ by 1. ExpandBuffer() is called first when the buffer is exactly full. */ void RegExpBytecodeGenerator::Emit8(uint32_t word) { + DCHECK(pc_ <= static_cast<int>(buffer_.size())); + if (pc_ == static_cast<int>(buffer_.size())) { + ExpandBuffer(); + } + *reinterpret_cast<unsigned char*>(buffer_.data() + pc_) = word; + pc_ += 1; +} + +/* Stores the full 32-bit |word| at offset pc_ and advances pc_ by 4. ExpandBuffer() is called first when fewer than 4 bytes remain. */ void RegExpBytecodeGenerator::Emit32(uint32_t word) { + DCHECK(pc_ <= static_cast<int>(buffer_.size())); + if (pc_ + 3 >= static_cast<int>(buffer_.size())) { + ExpandBuffer(); + } + *reinterpret_cast<uint32_t*>(buffer_.data() + pc_) = word; + pc_ += 4; +} + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_ diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.cc b/js/src/irregexp/imported/regexp-bytecode-generator.cc new file mode 100644 index 0000000000..934a39130d --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecode-generator.cc @@ -0,0 +1,405 @@ +// Copyright 2008-2009 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#include "irregexp/imported/regexp-bytecode-generator.h" + +#include "irregexp/imported/regexp-bytecode-generator-inl.h" +#include "irregexp/imported/regexp-bytecode-peephole.h" +#include "irregexp/imported/regexp-bytecodes.h" +#include "irregexp/imported/regexp-macro-assembler.h" + +namespace v8 { +namespace internal { + +RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone) + : RegExpMacroAssembler(isolate, zone), + buffer_(kInitialBufferSize, zone), + pc_(0), + advance_current_end_(kInvalidPC), + jump_edges_(zone), + isolate_(isolate) {} + +RegExpBytecodeGenerator::~RegExpBytecodeGenerator() { + if (backtrack_.is_linked()) backtrack_.Unuse(); +} + +RegExpBytecodeGenerator::IrregexpImplementation +RegExpBytecodeGenerator::Implementation() { + return kBytecodeImplementation; +} + +/* Binds |l| to the current pc_. If the label is linked, its pending references form a chain threaded through the buffer: each pending 32-bit slot holds the position of the next pending slot and 0 ends the chain. Every slot on the chain is overwritten with pc_ and the pair (slot position, pc_) is recorded in jump_edges_. Binding also resets advance_current_end_, so a following GoTo() is not fused with an earlier advance. */ void RegExpBytecodeGenerator::Bind(Label* l) { + advance_current_end_ = kInvalidPC; + DCHECK(!l->is_bound()); + if (l->is_linked()) { + int pos = l->pos(); + while (pos != 0) { + int fixup = pos; + pos = *reinterpret_cast<int32_t*>(buffer_.data() + fixup); + *reinterpret_cast<uint32_t*>(buffer_.data() + fixup) = pc_; + jump_edges_.emplace(fixup, pc_); + } + } + l->bind_to(pc_); +} + +/* Emits the 32-bit jump operand for |l|; a null |l| stands for the backtrack_ label. For a bound label the operand is the label position and the edge (pc_, position) is recorded. For an unbound label the operand is the previously linked position (0 if the label was not linked yet) and the label is linked to this slot, which extends the chain that Bind() patches. */ void RegExpBytecodeGenerator::EmitOrLink(Label* l) { + if (l == nullptr) l = &backtrack_; + int pos = 0; + if (l->is_bound()) { + pos = l->pos(); + jump_edges_.emplace(pc_, pos); + } else { + if (l->is_linked()) { + pos = l->pos(); + } + l->link_to(pc_); + } + Emit32(pos); +} + +void RegExpBytecodeGenerator::PopRegister(int register_index) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_POP_REGISTER, register_index); +} + +/* Note: |check_stack_limit| is not used by this implementation; the same bytecode is emitted for either value. */ void RegExpBytecodeGenerator::PushRegister(int register_index, + StackCheckFlag check_stack_limit) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_PUSH_REGISTER, register_index); +} + +void RegExpBytecodeGenerator::WriteCurrentPositionToRegister(int register_index, + int cp_offset) { +
DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_SET_REGISTER_TO_CP, register_index); + Emit32(cp_offset); // Current position offset. +} + +/* Sets every register in the inclusive range from |reg_from| to |reg_to| to -1 by emitting one SetRegister per register. */ void RegExpBytecodeGenerator::ClearRegisters(int reg_from, int reg_to) { + DCHECK(reg_from <= reg_to); + for (int reg = reg_from; reg <= reg_to; reg++) { + SetRegister(reg, -1); + } +} + +void RegExpBytecodeGenerator::ReadCurrentPositionFromRegister( + int register_index) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_SET_CP_TO_REGISTER, register_index); +} + +void RegExpBytecodeGenerator::WriteStackPointerToRegister(int register_index) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_SET_REGISTER_TO_SP, register_index); +} + +void RegExpBytecodeGenerator::ReadStackPointerFromRegister(int register_index) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_SET_SP_TO_REGISTER, register_index); +} + +void RegExpBytecodeGenerator::SetCurrentPositionFromEnd(int by) { + DCHECK(is_uint24(by)); + Emit(BC_SET_CURRENT_POSITION_FROM_END, by); +} + +/* Emits BC_SET_REGISTER with the register index as the inline operand, followed by the value |to| as a separate 32-bit word. */ void RegExpBytecodeGenerator::SetRegister(int register_index, int to) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_SET_REGISTER, register_index); + Emit32(to); +} + +/* Emits BC_ADVANCE_REGISTER with the register index as the inline operand, followed by the increment |by| as a separate 32-bit word. */ void RegExpBytecodeGenerator::AdvanceRegister(int register_index, int by) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_ADVANCE_REGISTER, register_index); + Emit32(by); +} + +void RegExpBytecodeGenerator::PopCurrentPosition() { Emit(BC_POP_CP, 0); } + +void RegExpBytecodeGenerator::PushCurrentPosition() { Emit(BC_PUSH_CP, 0); } + +void RegExpBytecodeGenerator::Backtrack() { + int error_code = + can_fallback() ?
RegExp::RE_FALLBACK_TO_EXPERIMENTAL : RegExp::RE_FAILURE; + Emit(BC_POP_BT, error_code); +} + +/* Emits a jump to |l|. When the instruction emitted immediately before was the BC_ADVANCE_CP recorded by AdvanceCurrentPosition() (advance_current_end_ equals pc_), pc_ is rewound to the start of that instruction and it is re-emitted as a single BC_ADVANCE_CP_AND_GOTO carrying the recorded offset. */ void RegExpBytecodeGenerator::GoTo(Label* l) { + if (advance_current_end_ == pc_) { + // Combine advance current and goto. + pc_ = advance_current_start_; + Emit(BC_ADVANCE_CP_AND_GOTO, advance_current_offset_); + EmitOrLink(l); + advance_current_end_ = kInvalidPC; + } else { + // Regular goto. + Emit(BC_GOTO, 0); + EmitOrLink(l); + } +} + +void RegExpBytecodeGenerator::PushBacktrack(Label* l) { + Emit(BC_PUSH_BT, 0); + EmitOrLink(l); +} + +bool RegExpBytecodeGenerator::Succeed() { + Emit(BC_SUCCEED, 0); + return false; // Restart matching for global regexp not supported. +} + +void RegExpBytecodeGenerator::Fail() { Emit(BC_FAIL, 0); } + +/* Emits BC_ADVANCE_CP with |by| as operand and records the start position, the offset and the end position of the emitted instruction, so that a GoTo() issued directly afterwards can fuse with it. */ void RegExpBytecodeGenerator::AdvanceCurrentPosition(int by) { + // TODO(chromium:1166138): Turn back into DCHECKs once the underlying issue + // is fixed. + CHECK_LE(kMinCPOffset, by); + CHECK_GE(kMaxCPOffset, by); + advance_current_start_ = pc_; + advance_current_offset_ = by; + Emit(BC_ADVANCE_CP, by); + advance_current_end_ = pc_; +} + +void RegExpBytecodeGenerator::CheckGreedyLoop( + Label* on_tos_equals_current_position) { + Emit(BC_CHECK_GREEDY, 0); + EmitOrLink(on_tos_equals_current_position); +} + +/* Emits a load of |characters| characters at |cp_offset|. When |eats_at_least| exceeds |characters| and |check_bounds| is set, one BC_CHECK_CURRENT_POSITION for cp_offset plus eats_at_least (jumping to |on_failure|) is emitted first and the load that follows is emitted without its own bounds check. */ void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset, + Label* on_failure, + bool check_bounds, + int characters, + int eats_at_least) { + DCHECK_GE(eats_at_least, characters); + if (eats_at_least > characters && check_bounds) { + DCHECK(is_int24(cp_offset + eats_at_least)); + Emit(BC_CHECK_CURRENT_POSITION, cp_offset + eats_at_least); + EmitOrLink(on_failure); + check_bounds = false; // Load below doesn't need to check.
+ } + + DCHECK_LE(kMinCPOffset, cp_offset); + DCHECK_GE(kMaxCPOffset, cp_offset); + /* Pick the opcode by load width (4, 2 or 1 characters) and by whether the plain or the _UNCHECKED variant is wanted. */ int bytecode; + if (check_bounds) { + if (characters == 4) { + bytecode = BC_LOAD_4_CURRENT_CHARS; + } else if (characters == 2) { + bytecode = BC_LOAD_2_CURRENT_CHARS; + } else { + DCHECK_EQ(1, characters); + bytecode = BC_LOAD_CURRENT_CHAR; + } + } else { + if (characters == 4) { + bytecode = BC_LOAD_4_CURRENT_CHARS_UNCHECKED; + } else if (characters == 2) { + bytecode = BC_LOAD_2_CURRENT_CHARS_UNCHECKED; + } else { + DCHECK_EQ(1, characters); + bytecode = BC_LOAD_CURRENT_CHAR_UNCHECKED; + } + } + Emit(bytecode, cp_offset); + /* Only the checked variants are followed by a jump target. */ if (check_bounds) EmitOrLink(on_failure); +} + +void RegExpBytecodeGenerator::CheckCharacterLT(base::uc16 limit, + Label* on_less) { + Emit(BC_CHECK_LT, limit); + EmitOrLink(on_less); +} + +void RegExpBytecodeGenerator::CheckCharacterGT(base::uc16 limit, + Label* on_greater) { + Emit(BC_CHECK_GT, limit); + EmitOrLink(on_greater); +} + +/* Emits a comparison of the current character against |c| with a jump to |on_equal|. A value above MAX_FIRST_ARG does not go into the instruction word: BC_CHECK_4_CHARS is emitted with a zero inline operand and |c| follows as a separate 32-bit word. Otherwise |c| is the inline operand of BC_CHECK_CHAR. */ void RegExpBytecodeGenerator::CheckCharacter(uint32_t c, Label* on_equal) { + if (c > MAX_FIRST_ARG) { + Emit(BC_CHECK_4_CHARS, 0); + Emit32(c); + } else { + Emit(BC_CHECK_CHAR, c); + } + EmitOrLink(on_equal); +} + +void RegExpBytecodeGenerator::CheckAtStart(int cp_offset, Label* on_at_start) { + Emit(BC_CHECK_AT_START, cp_offset); + EmitOrLink(on_at_start); +} + +void RegExpBytecodeGenerator::CheckNotAtStart(int cp_offset, + Label* on_not_at_start) { + Emit(BC_CHECK_NOT_AT_START, cp_offset); + EmitOrLink(on_not_at_start); +} + +/* Same operand encoding as CheckCharacter, using the NOT opcodes and jumping to |on_not_equal|. */ void RegExpBytecodeGenerator::CheckNotCharacter(uint32_t c, + Label* on_not_equal) { + if (c > MAX_FIRST_ARG) { + Emit(BC_CHECK_NOT_4_CHARS, 0); + Emit32(c); + } else { + Emit(BC_CHECK_NOT_CHAR, c); + } + EmitOrLink(on_not_equal); +} + +/* Same operand encoding as CheckCharacter, with |mask| emitted as an additional 32-bit word before the jump target. */ void RegExpBytecodeGenerator::CheckCharacterAfterAnd(uint32_t c, uint32_t mask, + Label* on_equal) { + if (c > MAX_FIRST_ARG) { + Emit(BC_AND_CHECK_4_CHARS, 0); + Emit32(c); + } else { + Emit(BC_AND_CHECK_CHAR, c); + } + Emit32(mask); + EmitOrLink(on_equal); +} + +void
RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c, + uint32_t mask, + Label* on_not_equal) { + if (c > MAX_FIRST_ARG) { + Emit(BC_AND_CHECK_NOT_4_CHARS, 0); + Emit32(c); + } else { + Emit(BC_AND_CHECK_NOT_CHAR, c); + } + Emit32(mask); + EmitOrLink(on_not_equal); +} + +/* Operand layout: |c| is the inline operand; |minus| and |mask| follow as two 16-bit values, then the jump target. */ void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd( + base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) { + Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c); + Emit16(minus); + Emit16(mask); + EmitOrLink(on_not_equal); +} + +/* Operand layout: the inline operand is zero; |from| and |to| follow as two 16-bit values, then the jump target. */ void RegExpBytecodeGenerator::CheckCharacterInRange(base::uc16 from, + base::uc16 to, + Label* on_in_range) { + Emit(BC_CHECK_CHAR_IN_RANGE, 0); + Emit16(from); + Emit16(to); + EmitOrLink(on_in_range); +} + +void RegExpBytecodeGenerator::CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, + Label* on_not_in_range) { + Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0); + Emit16(from); + Emit16(to); + EmitOrLink(on_not_in_range); +} + +/* Emits BC_CHECK_BIT_IN_TABLE, the jump target, and then the table packed into bits: each emitted byte covers kBitsPerByte consecutive table entries, and bit j of a byte is set when the j-th entry of that group is non-zero. kTableSize entries are read from |table|. */ void RegExpBytecodeGenerator::CheckBitInTable(Handle<ByteArray> table, + Label* on_bit_set) { + Emit(BC_CHECK_BIT_IN_TABLE, 0); + EmitOrLink(on_bit_set); + for (int i = 0; i < kTableSize; i += kBitsPerByte) { + int byte = 0; + for (int j = 0; j < kBitsPerByte; j++) { + if (table->get(i + j) != 0) byte |= 1 << j; + } + Emit8(byte); + } +} + +/* Emits the back-reference check for the capture starting at register |start_reg|; |read_backward| selects the _BACKWARD opcode. */ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg, + bool read_backward, + Label* on_not_equal) { + DCHECK_LE(0, start_reg); + DCHECK_GE(kMaxRegister, start_reg); + Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF, + start_reg); + EmitOrLink(on_not_equal); +} + +/* Case-insensitive variant: the opcode is chosen from four alternatives by the combination of |read_backward| and |unicode|. */ void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase( + int start_reg, bool read_backward, bool unicode, Label* on_not_equal) { + DCHECK_LE(0, start_reg); + DCHECK_GE(kMaxRegister, start_reg); + Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD + : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) + : (unicode ?
BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE + : BC_CHECK_NOT_BACK_REF_NO_CASE), + start_reg); + EmitOrLink(on_not_equal); +} + +void RegExpBytecodeGenerator::IfRegisterLT(int register_index, int comparand, + Label* on_less_than) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_CHECK_REGISTER_LT, register_index); + Emit32(comparand); + EmitOrLink(on_less_than); +} + +void RegExpBytecodeGenerator::IfRegisterGE(int register_index, int comparand, + Label* on_greater_or_equal) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_CHECK_REGISTER_GE, register_index); + Emit32(comparand); + EmitOrLink(on_greater_or_equal); +} + +void RegExpBytecodeGenerator::IfRegisterEqPos(int register_index, + Label* on_eq) { + DCHECK_LE(0, register_index); + DCHECK_GE(kMaxRegister, register_index); + Emit(BC_CHECK_REGISTER_EQ_POS, register_index); + EmitOrLink(on_eq); +} + +Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) { + Bind(&backtrack_); + Backtrack(); + + Handle<ByteArray> array; + if (v8_flags.regexp_peephole_optimization) { + array = RegExpBytecodePeepholeOptimization::OptimizeBytecode( + isolate_, zone(), source, buffer_.data(), length(), jump_edges_); + } else { + array = isolate_->factory()->NewByteArray(length()); + Copy(array->GetDataStartAddress()); + } + + return array; +} + +int RegExpBytecodeGenerator::length() { return pc_; } + +void RegExpBytecodeGenerator::Copy(byte* a) { + MemCopy(a, buffer_.data(), length()); +} + +void RegExpBytecodeGenerator::ExpandBuffer() { + // TODO(jgruber): The growth strategy could be smarter for large sizes. + // TODO(jgruber): It's not necessary to default-initialize new elements. 
+ buffer_.resize(buffer_.size() * 2); +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.h b/js/src/irregexp/imported/regexp-bytecode-generator.h new file mode 100644 index 0000000000..351f6e0cc6 --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecode-generator.h @@ -0,0 +1,140 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_ +#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_ + +#include "irregexp/imported/regexp-macro-assembler.h" + +namespace v8 { +namespace internal { + +// An assembler/generator for the Irregexp byte code. +class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { + public: + // Create an assembler. Instructions and relocation information are emitted + // into a buffer, with the instructions starting from the beginning and the + // relocation information starting from the end of the buffer. See CodeDesc + // for a detailed comment on the layout (globals.h). + // + // The assembler allocates and grows its own buffer; kInitialBufferSize is + // presumably the initial buffer size. The buffer is owned by the assembler + // and deallocated upon destruction of the assembler. + RegExpBytecodeGenerator(Isolate* isolate, Zone* zone); + ~RegExpBytecodeGenerator() override; + // The byte-code interpreter checks on each push anyway. + int stack_limit_slack() override { return 1; } + bool CanReadUnaligned() const override { return false; } + void Bind(Label* label) override; + void AdvanceCurrentPosition(int by) override; // Signed cp change. 
+ void PopCurrentPosition() override; + void PushCurrentPosition() override; + void Backtrack() override; + void GoTo(Label* label) override; + void PushBacktrack(Label* label) override; + bool Succeed() override; + void Fail() override; + void PopRegister(int register_index) override; + void PushRegister(int register_index, + StackCheckFlag check_stack_limit) override; + void AdvanceRegister(int reg, int by) override; // r[reg] += by. + void SetCurrentPositionFromEnd(int by) override; + void SetRegister(int register_index, int to) override; + void WriteCurrentPositionToRegister(int reg, int cp_offset) override; + void ClearRegisters(int reg_from, int reg_to) override; + void ReadCurrentPositionFromRegister(int reg) override; + void WriteStackPointerToRegister(int reg) override; + void ReadStackPointerFromRegister(int reg) override; + void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, + bool check_bounds, int characters, + int eats_at_least) override; + void CheckCharacter(unsigned c, Label* on_equal) override; + void CheckCharacterAfterAnd(unsigned c, unsigned mask, + Label* on_equal) override; + void CheckCharacterGT(base::uc16 limit, Label* on_greater) override; + void CheckCharacterLT(base::uc16 limit, Label* on_less) override; + void CheckGreedyLoop(Label* on_tos_equals_current_position) override; + void CheckAtStart(int cp_offset, Label* on_at_start) override; + void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override; + void CheckNotCharacter(unsigned c, Label* on_not_equal) override; + void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, + Label* on_not_equal) override; + void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, + base::uc16 mask, + Label* on_not_equal) override; + void CheckCharacterInRange(base::uc16 from, base::uc16 to, + Label* on_in_range) override; + void CheckCharacterNotInRange(base::uc16 from, base::uc16 to, + Label* on_not_in_range) override; + bool CheckCharacterInRangeArray(const 
ZoneList<CharacterRange>* ranges, + Label* on_in_range) override { + // Disabled in the interpreter, because 1) there is no constant pool that + // could store the ByteArray pointer, 2) bytecode size limits are not as + // restrictive as code (e.g. branch distances on arm), 3) bytecode for + // large character classes is already quite compact. + // TODO(jgruber): Consider using BytecodeArrays (with a constant pool) + // instead of plain ByteArrays; then we could implement + // CheckCharacterInRangeArray in the interpreter. + return false; + } + bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges, + Label* on_not_in_range) override { + return false; + } + void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override; + void CheckNotBackReference(int start_reg, bool read_backward, + Label* on_no_match) override; + void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + bool unicode, + Label* on_no_match) override; + void IfRegisterLT(int register_index, int comparand, Label* if_lt) override; + void IfRegisterGE(int register_index, int comparand, Label* if_ge) override; + void IfRegisterEqPos(int register_index, Label* if_eq) override; + + IrregexpImplementation Implementation() override; + Handle<HeapObject> GetCode(Handle<String> source) override; + + private: + void ExpandBuffer(); + + // Code and bitmap emission. + inline void EmitOrLink(Label* label); + inline void Emit32(uint32_t x); + inline void Emit16(uint32_t x); + inline void Emit8(uint32_t x); + inline void Emit(uint32_t bc, uint32_t arg); + inline void Emit(uint32_t bc, int32_t arg); + // Bytecode buffer. + int length(); + void Copy(byte* a); + + // The buffer into which code and relocation info are generated. + static constexpr int kInitialBufferSize = 1024; + ZoneVector<byte> buffer_; + + // The program counter. 
+ int pc_; + Label backtrack_; + + int advance_current_start_; + int advance_current_offset_; + int advance_current_end_; + + // Stores jump edges emitted for the bytecode (used by + // RegExpBytecodePeepholeOptimization). + // Key: jump source (offset in buffer_ where jump destination is stored). + // Value: jump destination (offset in buffer_ to jump to). + ZoneUnorderedMap<int, int> jump_edges_; + + Isolate* isolate_; + + static const int kInvalidPC = -1; + + DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodeGenerator); +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_ diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.cc b/js/src/irregexp/imported/regexp-bytecode-peephole.cc new file mode 100644 index 0000000000..9e49bfbeca --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecode-peephole.cc @@ -0,0 +1,1027 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "irregexp/imported/regexp-bytecode-peephole.h" + +#include "irregexp/imported/regexp-bytecodes.h" + +namespace v8 { +namespace internal { + +namespace { + +struct BytecodeArgument { + int offset; + int length; + + BytecodeArgument(int offset, int length) : offset(offset), length(length) {} +}; + +struct BytecodeArgumentMapping : BytecodeArgument { + int new_length; + + BytecodeArgumentMapping(int offset, int length, int new_length) + : BytecodeArgument(offset, length), new_length(new_length) {} +}; + +struct BytecodeArgumentCheck : BytecodeArgument { + enum CheckType { kCheckAddress = 0, kCheckValue }; + CheckType type; + int check_offset; + int check_length; + + BytecodeArgumentCheck(int offset, int length, int check_offset) + : BytecodeArgument(offset, length), + type(kCheckAddress), + check_offset(check_offset) {} + BytecodeArgumentCheck(int offset, int length, int check_offset, + int check_length) + : BytecodeArgument(offset, length), + type(kCheckValue), + check_offset(check_offset), + check_length(check_length) {} +}; + +// Trie-Node for storing bytecode sequences we want to optimize. +class BytecodeSequenceNode { + public: + // Dummy bytecode used when we need to store/return a bytecode but it's not a + // valid bytecode in the current context. + static constexpr int kDummyBytecode = -1; + + BytecodeSequenceNode(int bytecode, Zone* zone); + // Adds a new node as child of the current node if it isn't a child already. + BytecodeSequenceNode& FollowedBy(int bytecode); + // Marks the end of a sequence and sets optimized bytecode to replace all + // bytecodes of the sequence with. + BytecodeSequenceNode& ReplaceWith(int bytecode); + // Maps arguments of bytecodes in the sequence to the optimized bytecode. + // Order of invocation determines order of arguments in the optimized + // bytecode. + // Invoking this method is only allowed on nodes that mark the end of a valid + // sequence (i.e. after ReplaceWith()). 
+ // bytecode_index_in_sequence: Zero-based index of the referred bytecode + // within the sequence (e.g. the bytecode passed to CreateSequence() has + // index 0). + // argument_offset: Zero-based offset to the argument within the bytecode + // (e.g. the first argument that's not packed with the bytecode has offset 4). + // argument_byte_length: Length of the argument. + // new_argument_byte_length: Length of the argument in the new bytecode + // (= argument_byte_length if omitted). + BytecodeSequenceNode& MapArgument(int bytecode_index_in_sequence, + int argument_offset, + int argument_byte_length, + int new_argument_byte_length = 0); + // Adds a check to the sequence node making it only a valid sequence when the + // argument of the current bytecode at the specified offset matches the offset + // to check against. + // argument_offset: Zero-based offset to the argument within the bytecode + // (e.g. the first argument that's not packed with the bytecode has offset 4). + // argument_byte_length: Length of the argument. + // check_byte_offset: Zero-based offset relative to the beginning of the + // sequence that needs to match the value given by argument_offset. (e.g. + // check_byte_offset 0 matches the address of the first bytecode in the + // sequence). + BytecodeSequenceNode& IfArgumentEqualsOffset(int argument_offset, + int argument_byte_length, + int check_byte_offset); + // Adds a check to the sequence node making it only a valid sequence when the + // argument of the current bytecode at the specified offset matches the + // argument of another bytecode in the sequence. + // This is similar to IfArgumentEqualsOffset, except that this method matches + // the values of both arguments. + BytecodeSequenceNode& IfArgumentEqualsValueAtOffset( + int argument_offset, int argument_byte_length, + int other_bytecode_index_in_sequence, int other_argument_offset, + int other_argument_byte_length); + // Marks an argument as unused. 
+ // All arguments that are not mapped explicitly have to be marked as unused. + // bytecode_index_in_sequence: Zero-based index of the referred bytecode + // within the sequence (e.g. the bytecode passed to CreateSequence() has + // index 0). + // argument_offset: Zero-based offset to the argument within the bytecode + // (e.g. the first argument that's not packed with the bytecode has offset 4). + // argument_byte_length: Length of the argument. + BytecodeSequenceNode& IgnoreArgument(int bytecode_index_in_sequence, + int argument_offset, + int argument_byte_length); + // Checks if the current node is valid for the sequence. I.e. all conditions + // set by IfArgumentEqualsOffset and IfArgumentEqualsValueAtOffset are + // fulfilled by this node for the actual bytecode sequence. + bool CheckArguments(const byte* bytecode, int pc); + // Returns whether this node marks the end of a valid sequence (i.e. can be + // replaced with an optimized bytecode). + bool IsSequence() const; + // Returns the length of the sequence in bytes. + int SequenceLength() const; + // Returns the optimized bytecode for the node or kDummyBytecode if it is not + // the end of a valid sequence. + int OptimizedBytecode() const; + // Returns the child of the current node matching the given bytecode or + // nullptr if no such child is found. + BytecodeSequenceNode* Find(int bytecode) const; + // Returns number of arguments mapped to the current node. + // Invoking this method is only allowed on nodes that mark the end of a valid + // sequence (i.e. if IsSequence()) + size_t ArgumentSize() const; + // Returns the argument-mapping of the argument at index. + // Invoking this method is only allowed on nodes that mark the end of a valid + // sequence (i.e. if IsSequence()) + BytecodeArgumentMapping ArgumentMapping(size_t index) const; + // Returns an iterator to begin of ignored arguments. + // Invoking this method is only allowed on nodes that mark the end of a valid + // sequence (i.e. 
if IsSequence()) + ZoneLinkedList<BytecodeArgument>::iterator ArgumentIgnoredBegin() const; + // Returns an iterator to end of ignored arguments. + // Invoking this method is only allowed on nodes that mark the end of a valid + // sequence (i.e. if IsSequence()) + ZoneLinkedList<BytecodeArgument>::iterator ArgumentIgnoredEnd() const; + // Returns whether the current node has ignored argument or not. + bool HasIgnoredArguments() const; + + private: + // Returns a node in the sequence specified by its index within the sequence. + BytecodeSequenceNode& GetNodeByIndexInSequence(int index_in_sequence); + Zone* zone() const; + + int bytecode_; + int bytecode_replacement_; + int index_in_sequence_; + int start_offset_; + BytecodeSequenceNode* parent_; + ZoneUnorderedMap<int, BytecodeSequenceNode*> children_; + ZoneVector<BytecodeArgumentMapping>* argument_mapping_; + ZoneLinkedList<BytecodeArgumentCheck>* argument_check_; + ZoneLinkedList<BytecodeArgument>* argument_ignored_; + + Zone* zone_; +}; + +// These definitions are here in order to please the linker, which in debug mode +// sometimes requires static constants to be defined in .cc files. +constexpr int BytecodeSequenceNode::kDummyBytecode; + +class RegExpBytecodePeephole { + public: + RegExpBytecodePeephole(Zone* zone, size_t buffer_size, + const ZoneUnorderedMap<int, int>& jump_edges); + + // Parses bytecode and fills the internal buffer with the potentially + // optimized bytecode. Returns true when optimizations were performed, false + // otherwise. + bool OptimizeBytecode(const byte* bytecode, int length); + // Copies the internal bytecode buffer to another buffer. The caller is + // responsible for allocating/freeing the memory. + void CopyOptimizedBytecode(byte* to_address) const; + int Length() const; + + private: + // Sets up all sequences that are going to be used. + void DefineStandardSequences(); + // Starts a new bytecode sequence. 
+ BytecodeSequenceNode& CreateSequence(int bytecode); + // Checks for optimization candidates at pc and emits optimized bytecode to + // the internal buffer. Returns the length of replaced bytecodes in bytes. + int TryOptimizeSequence(const byte* bytecode, int bytecode_length, + int start_pc); + // Emits optimized bytecode to the internal buffer. start_pc points to the + // start of the sequence in bytecode and last_node is the last + // BytecodeSequenceNode of the matching sequence found. + void EmitOptimization(int start_pc, const byte* bytecode, + const BytecodeSequenceNode& last_node); + // Adds a relative jump source fixup at pos. + // Jump source fixups are used to find offsets in the new bytecode that + // contain jump sources. + void AddJumpSourceFixup(int fixup, int pos); + // Adds a relative jump destination fixup at pos. + // Jump destination fixups are used to find offsets in the new bytecode that + // can be jumped to. + void AddJumpDestinationFixup(int fixup, int pos); + // Sets an absolute jump destination fixup at pos. + void SetJumpDestinationFixup(int fixup, int pos); + // Prepare internal structures used to fixup jumps. + void PrepareJumpStructures(const ZoneUnorderedMap<int, int>& jump_edges); + // Updates all jump targets in the new bytecode. + void FixJumps(); + // Update a single jump. + void FixJump(int jump_source, int jump_destination); + void AddSentinelFixups(int pos); + template <typename T> + void EmitValue(T value); + template <typename T> + void OverwriteValue(int offset, T value); + void CopyRangeToOutput(const byte* orig_bytecode, int start, int length); + void SetRange(byte value, int count); + void EmitArgument(int start_pc, const byte* bytecode, + BytecodeArgumentMapping arg); + int pc() const; + Zone* zone() const; + + ZoneVector<byte> optimized_bytecode_buffer_; + BytecodeSequenceNode* sequences_; + // Jumps used in old bytecode. 
+ // Key: Jump source (offset where destination is stored in old bytecode) + // Value: Destination + ZoneMap<int, int> jump_edges_; + // Jumps used in new bytecode. + // Key: Jump source (offset where destination is stored in new bytecode) + // Value: Destination + ZoneMap<int, int> jump_edges_mapped_; + // Number of times a jump destination is used within the bytecode. + // Key: Jump destination (offset in old bytecode). + // Value: Number of times jump destination is used. + ZoneMap<int, int> jump_usage_counts_; + // Maps offsets in old bytecode to fixups of sources (delta to new bytecode). + // Key: Offset in old bytecode from where the fixup is valid. + // Value: Delta to map jump source from old bytecode to new bytecode in bytes. + ZoneMap<int, int> jump_source_fixups_; + // Maps offsets in old bytecode to fixups of destinations (delta to new + // bytecode). + // Key: Offset in old bytecode from where the fixup is valid. + // Value: Delta to map jump destinations from old bytecode to new bytecode in + // bytes. 
+ ZoneMap<int, int> jump_destination_fixups_; + + Zone* zone_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodePeephole); +}; + +template <typename T> +T GetValue(const byte* buffer, int pos) { + DCHECK(IsAligned(reinterpret_cast<Address>(buffer + pos), alignof(T))); + return *reinterpret_cast<const T*>(buffer + pos); +} + +int32_t GetArgumentValue(const byte* bytecode, int offset, int length) { + switch (length) { + case 1: + return GetValue<byte>(bytecode, offset); + case 2: + return GetValue<int16_t>(bytecode, offset); + case 4: + return GetValue<int32_t>(bytecode, offset); + default: + UNREACHABLE(); + } +} + +BytecodeSequenceNode::BytecodeSequenceNode(int bytecode, Zone* zone) + : bytecode_(bytecode), + bytecode_replacement_(kDummyBytecode), + index_in_sequence_(0), + start_offset_(0), + parent_(nullptr), + children_(ZoneUnorderedMap<int, BytecodeSequenceNode*>(zone)), + argument_mapping_(zone->New<ZoneVector<BytecodeArgumentMapping>>(zone)), + argument_check_(zone->New<ZoneLinkedList<BytecodeArgumentCheck>>(zone)), + argument_ignored_(zone->New<ZoneLinkedList<BytecodeArgument>>(zone)), + zone_(zone) {} + +BytecodeSequenceNode& BytecodeSequenceNode::FollowedBy(int bytecode) { + DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount); + + if (children_.find(bytecode) == children_.end()) { + BytecodeSequenceNode* new_node = + zone()->New<BytecodeSequenceNode>(bytecode, zone()); + // If node is not the first in the sequence, set offsets and parent. 
+ if (bytecode_ != kDummyBytecode) { + new_node->start_offset_ = start_offset_ + RegExpBytecodeLength(bytecode_); + new_node->index_in_sequence_ = index_in_sequence_ + 1; + new_node->parent_ = this; + } + children_[bytecode] = new_node; + } + + return *children_[bytecode]; +} + +BytecodeSequenceNode& BytecodeSequenceNode::ReplaceWith(int bytecode) { + DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount); + + bytecode_replacement_ = bytecode; + + return *this; +} + +BytecodeSequenceNode& BytecodeSequenceNode::MapArgument( + int bytecode_index_in_sequence, int argument_offset, + int argument_byte_length, int new_argument_byte_length) { + DCHECK(IsSequence()); + DCHECK_LE(bytecode_index_in_sequence, index_in_sequence_); + + BytecodeSequenceNode& ref_node = + GetNodeByIndexInSequence(bytecode_index_in_sequence); + DCHECK_LT(argument_offset, RegExpBytecodeLength(ref_node.bytecode_)); + + int absolute_offset = ref_node.start_offset_ + argument_offset; + if (new_argument_byte_length == 0) { + new_argument_byte_length = argument_byte_length; + } + + argument_mapping_->push_back(BytecodeArgumentMapping{ + absolute_offset, argument_byte_length, new_argument_byte_length}); + + return *this; +} + +BytecodeSequenceNode& BytecodeSequenceNode::IfArgumentEqualsOffset( + int argument_offset, int argument_byte_length, int check_byte_offset) { + DCHECK_LT(argument_offset, RegExpBytecodeLength(bytecode_)); + DCHECK(argument_byte_length == 1 || argument_byte_length == 2 || + argument_byte_length == 4); + + int absolute_offset = start_offset_ + argument_offset; + + argument_check_->push_back(BytecodeArgumentCheck{ + absolute_offset, argument_byte_length, check_byte_offset}); + + return *this; +} + +BytecodeSequenceNode& BytecodeSequenceNode::IfArgumentEqualsValueAtOffset( + int argument_offset, int argument_byte_length, + int other_bytecode_index_in_sequence, int other_argument_offset, + int other_argument_byte_length) { + DCHECK_LT(argument_offset, RegExpBytecodeLength(bytecode_)); 
+ DCHECK_LE(other_bytecode_index_in_sequence, index_in_sequence_); + DCHECK_EQ(argument_byte_length, other_argument_byte_length); + + BytecodeSequenceNode& ref_node = + GetNodeByIndexInSequence(other_bytecode_index_in_sequence); + DCHECK_LT(other_argument_offset, RegExpBytecodeLength(ref_node.bytecode_)); + + int absolute_offset = start_offset_ + argument_offset; + int other_absolute_offset = ref_node.start_offset_ + other_argument_offset; + + argument_check_->push_back( + BytecodeArgumentCheck{absolute_offset, argument_byte_length, + other_absolute_offset, other_argument_byte_length}); + + return *this; +} + +BytecodeSequenceNode& BytecodeSequenceNode::IgnoreArgument( + int bytecode_index_in_sequence, int argument_offset, + int argument_byte_length) { + DCHECK(IsSequence()); + DCHECK_LE(bytecode_index_in_sequence, index_in_sequence_); + + BytecodeSequenceNode& ref_node = + GetNodeByIndexInSequence(bytecode_index_in_sequence); + DCHECK_LT(argument_offset, RegExpBytecodeLength(ref_node.bytecode_)); + + int absolute_offset = ref_node.start_offset_ + argument_offset; + + argument_ignored_->push_back( + BytecodeArgument{absolute_offset, argument_byte_length}); + + return *this; +} + +bool BytecodeSequenceNode::CheckArguments(const byte* bytecode, int pc) { + bool is_valid = true; + for (auto check_iter = argument_check_->begin(); + check_iter != argument_check_->end() && is_valid; check_iter++) { + auto value = + GetArgumentValue(bytecode, pc + check_iter->offset, check_iter->length); + if (check_iter->type == BytecodeArgumentCheck::kCheckAddress) { + is_valid &= value == pc + check_iter->check_offset; + } else if (check_iter->type == BytecodeArgumentCheck::kCheckValue) { + auto other_value = GetArgumentValue( + bytecode, pc + check_iter->check_offset, check_iter->check_length); + is_valid &= value == other_value; + } else { + UNREACHABLE(); + } + } + return is_valid; +} + +bool BytecodeSequenceNode::IsSequence() const { + return bytecode_replacement_ != 
kDummyBytecode; +} + +int BytecodeSequenceNode::SequenceLength() const { + return start_offset_ + RegExpBytecodeLength(bytecode_); +} + +int BytecodeSequenceNode::OptimizedBytecode() const { + return bytecode_replacement_; +} + +BytecodeSequenceNode* BytecodeSequenceNode::Find(int bytecode) const { + auto found = children_.find(bytecode); + if (found == children_.end()) return nullptr; + return found->second; +} + +size_t BytecodeSequenceNode::ArgumentSize() const { + DCHECK(IsSequence()); + return argument_mapping_->size(); +} + +BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping( + size_t index) const { + DCHECK(IsSequence()); + DCHECK(argument_mapping_ != nullptr); + DCHECK_LT(index, argument_mapping_->size()); + + return argument_mapping_->at(index); +} + +ZoneLinkedList<BytecodeArgument>::iterator +BytecodeSequenceNode::ArgumentIgnoredBegin() const { + DCHECK(IsSequence()); + DCHECK(argument_ignored_ != nullptr); + return argument_ignored_->begin(); +} + +ZoneLinkedList<BytecodeArgument>::iterator +BytecodeSequenceNode::ArgumentIgnoredEnd() const { + DCHECK(IsSequence()); + DCHECK(argument_ignored_ != nullptr); + return argument_ignored_->end(); +} + +bool BytecodeSequenceNode::HasIgnoredArguments() const { + return argument_ignored_ != nullptr; +} + +BytecodeSequenceNode& BytecodeSequenceNode::GetNodeByIndexInSequence( + int index_in_sequence) { + DCHECK_LE(index_in_sequence, index_in_sequence_); + + if (index_in_sequence < index_in_sequence_) { + DCHECK(parent_ != nullptr); + return parent_->GetNodeByIndexInSequence(index_in_sequence); + } else { + return *this; + } +} + +Zone* BytecodeSequenceNode::zone() const { return zone_; } + +RegExpBytecodePeephole::RegExpBytecodePeephole( + Zone* zone, size_t buffer_size, + const ZoneUnorderedMap<int, int>& jump_edges) + : optimized_bytecode_buffer_(zone), + sequences_(zone->New<BytecodeSequenceNode>( + BytecodeSequenceNode::kDummyBytecode, zone)), + jump_edges_(zone), + jump_edges_mapped_(zone), + 
jump_usage_counts_(zone), + jump_source_fixups_(zone), + jump_destination_fixups_(zone), + zone_(zone) { + optimized_bytecode_buffer_.reserve(buffer_size); + PrepareJumpStructures(jump_edges); + DefineStandardSequences(); + // Sentinel fixups at beginning of bytecode (position -1) so we don't have to + // check for end of iterator inside the fixup loop. + // In general fixups are deltas of original offsets of jump + // sources/destinations (in the old bytecode) to find them in the new + // bytecode. All jump targets are fixed after the new bytecode is fully + // emitted in the internal buffer. + AddSentinelFixups(-1); + // Sentinel fixups at end of (old) bytecode so we don't have to check for + // end of iterator inside the fixup loop. + DCHECK_LE(buffer_size, std::numeric_limits<int>::max()); + AddSentinelFixups(static_cast<int>(buffer_size)); +} + +void RegExpBytecodePeephole::DefineStandardSequences() { + // Commonly used sequences can be found by creating regexp bytecode traces + // (--trace-regexp-bytecodes) and using v8/tools/regexp-sequences.py. + CreateSequence(BC_LOAD_CURRENT_CHAR) + .FollowedBy(BC_CHECK_BIT_IN_TABLE) + .FollowedBy(BC_ADVANCE_CP_AND_GOTO) + // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the + // first bytecode in this sequence. + .IfArgumentEqualsOffset(4, 4, 0) + .ReplaceWith(BC_SKIP_UNTIL_BIT_IN_TABLE) + .MapArgument(0, 1, 3) // load offset + .MapArgument(2, 1, 3, 4) // advance by + .MapArgument(1, 8, 16) // bit table + .MapArgument(1, 4, 4) // goto when match + .MapArgument(0, 4, 4) // goto on failure + .IgnoreArgument(2, 4, 4); // loop jump + + CreateSequence(BC_CHECK_CURRENT_POSITION) + .FollowedBy(BC_LOAD_CURRENT_CHAR_UNCHECKED) + .FollowedBy(BC_CHECK_CHAR) + .FollowedBy(BC_ADVANCE_CP_AND_GOTO) + // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the + // first bytecode in this sequence. 
+ .IfArgumentEqualsOffset(4, 4, 0) + .ReplaceWith(BC_SKIP_UNTIL_CHAR_POS_CHECKED) + .MapArgument(1, 1, 3) // load offset + .MapArgument(3, 1, 3, 2) // advance_by + .MapArgument(2, 1, 3, 2) // c + .MapArgument(0, 1, 3, 4) // eats at least + .MapArgument(2, 4, 4) // goto when match + .MapArgument(0, 4, 4) // goto on failure + .IgnoreArgument(3, 4, 4); // loop jump + + CreateSequence(BC_CHECK_CURRENT_POSITION) + .FollowedBy(BC_LOAD_CURRENT_CHAR_UNCHECKED) + .FollowedBy(BC_AND_CHECK_CHAR) + .FollowedBy(BC_ADVANCE_CP_AND_GOTO) + // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the + // first bytecode in this sequence. + .IfArgumentEqualsOffset(4, 4, 0) + .ReplaceWith(BC_SKIP_UNTIL_CHAR_AND) + .MapArgument(1, 1, 3) // load offset + .MapArgument(3, 1, 3, 2) // advance_by + .MapArgument(2, 1, 3, 2) // c + .MapArgument(2, 4, 4) // mask + .MapArgument(0, 1, 3, 4) // eats at least + .MapArgument(2, 8, 4) // goto when match + .MapArgument(0, 4, 4) // goto on failure + .IgnoreArgument(3, 4, 4); // loop jump + + // TODO(pthier): It might make sense for short sequences like this one to only + // optimize them if the resulting optimization is not longer than the current + // one. This could be the case if there are jumps inside the sequence and we + // have to replicate parts of the sequence. A method to mark such sequences + // might be useful. + CreateSequence(BC_LOAD_CURRENT_CHAR) + .FollowedBy(BC_CHECK_CHAR) + .FollowedBy(BC_ADVANCE_CP_AND_GOTO) + // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the + // first bytecode in this sequence. 
+ .IfArgumentEqualsOffset(4, 4, 0) + .ReplaceWith(BC_SKIP_UNTIL_CHAR) + .MapArgument(0, 1, 3) // load offset + .MapArgument(2, 1, 3, 2) // advance by + .MapArgument(1, 1, 3, 2) // character + .MapArgument(1, 4, 4) // goto when match + .MapArgument(0, 4, 4) // goto on failure + .IgnoreArgument(2, 4, 4); // loop jump + + CreateSequence(BC_LOAD_CURRENT_CHAR) + .FollowedBy(BC_CHECK_CHAR) + .FollowedBy(BC_CHECK_CHAR) + // Sequence is only valid if the jump targets of both CHECK_CHAR bytecodes + // are equal. + .IfArgumentEqualsValueAtOffset(4, 4, 1, 4, 4) + .FollowedBy(BC_ADVANCE_CP_AND_GOTO) + // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the + // first bytecode in this sequence. + .IfArgumentEqualsOffset(4, 4, 0) + .ReplaceWith(BC_SKIP_UNTIL_CHAR_OR_CHAR) + .MapArgument(0, 1, 3) // load offset + .MapArgument(3, 1, 3, 4) // advance by + .MapArgument(1, 1, 3, 2) // character 1 + .MapArgument(2, 1, 3, 2) // character 2 + .MapArgument(1, 4, 4) // goto when match + .MapArgument(0, 4, 4) // goto on failure + .IgnoreArgument(2, 4, 4) // goto when match 2 + .IgnoreArgument(3, 4, 4); // loop jump + + CreateSequence(BC_LOAD_CURRENT_CHAR) + .FollowedBy(BC_CHECK_GT) + // Sequence is only valid if the jump target of CHECK_GT is the first + // bytecode AFTER the whole sequence. + .IfArgumentEqualsOffset(4, 4, 56) + .FollowedBy(BC_CHECK_BIT_IN_TABLE) + // Sequence is only valid if the jump target of CHECK_BIT_IN_TABLE is + // the ADVANCE_CP_AND_GOTO bytecode at the end of the sequence. + .IfArgumentEqualsOffset(4, 4, 48) + .FollowedBy(BC_GOTO) + // Sequence is only valid if the jump target of GOTO is the same as the + // jump target of CHECK_GT (i.e. both jump to the first bytecode AFTER the + // whole sequence). + .IfArgumentEqualsValueAtOffset(4, 4, 1, 4, 4) + .FollowedBy(BC_ADVANCE_CP_AND_GOTO) + // Sequence is only valid if the jump target of ADVANCE_CP_AND_GOTO is the + // first bytecode in this sequence. 
+ .IfArgumentEqualsOffset(4, 4, 0) + .ReplaceWith(BC_SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) + .MapArgument(0, 1, 3) // load offset + .MapArgument(4, 1, 3, 2) // advance by + .MapArgument(1, 1, 3, 2) // character + .MapArgument(2, 8, 16) // bit table + .MapArgument(1, 4, 4) // goto when match + .MapArgument(0, 4, 4) // goto on failure + .IgnoreArgument(2, 4, 4) // indirect loop jump + .IgnoreArgument(3, 4, 4) // jump out of loop + .IgnoreArgument(4, 4, 4); // loop jump +} + +bool RegExpBytecodePeephole::OptimizeBytecode(const byte* bytecode, + int length) { + int old_pc = 0; + bool did_optimize = false; + + while (old_pc < length) { + int replaced_len = TryOptimizeSequence(bytecode, length, old_pc); + if (replaced_len > 0) { + old_pc += replaced_len; + did_optimize = true; + } else { + int bc = bytecode[old_pc]; + int bc_len = RegExpBytecodeLength(bc); + CopyRangeToOutput(bytecode, old_pc, bc_len); + old_pc += bc_len; + } + } + + if (did_optimize) { + FixJumps(); + } + + return did_optimize; +} + +void RegExpBytecodePeephole::CopyOptimizedBytecode(byte* to_address) const { + MemCopy(to_address, &(*optimized_bytecode_buffer_.begin()), Length()); +} + +int RegExpBytecodePeephole::Length() const { return pc(); } + +BytecodeSequenceNode& RegExpBytecodePeephole::CreateSequence(int bytecode) { + DCHECK(sequences_ != nullptr); + DCHECK(0 <= bytecode && bytecode < kRegExpBytecodeCount); + + return sequences_->FollowedBy(bytecode); +} + +int RegExpBytecodePeephole::TryOptimizeSequence(const byte* bytecode, + int bytecode_length, + int start_pc) { + BytecodeSequenceNode* seq_node = sequences_; + BytecodeSequenceNode* valid_seq_end = nullptr; + + int current_pc = start_pc; + + // Check for the longest valid sequence matching any of the pre-defined + // sequences in the Trie data structure. 
+ while (current_pc < bytecode_length) { + seq_node = seq_node->Find(bytecode[current_pc]); + if (seq_node == nullptr) break; + if (!seq_node->CheckArguments(bytecode, start_pc)) break; + + if (seq_node->IsSequence()) valid_seq_end = seq_node; + current_pc += RegExpBytecodeLength(bytecode[current_pc]); + } + + if (valid_seq_end) { + EmitOptimization(start_pc, bytecode, *valid_seq_end); + return valid_seq_end->SequenceLength(); + } + + return 0; +} + +void RegExpBytecodePeephole::EmitOptimization( + int start_pc, const byte* bytecode, const BytecodeSequenceNode& last_node) { +#ifdef DEBUG + int optimized_start_pc = pc(); +#endif + // Jump sources that are mapped or marked as unused will be deleted at the end + // of this method. We don't delete them immediately as we might need the + // information when we have to preserve bytecodes at the end. + // TODO(pthier): Replace with a stack-allocated data structure. + ZoneLinkedList<int> delete_jumps = ZoneLinkedList<int>(zone()); + + uint32_t bc = last_node.OptimizedBytecode(); + EmitValue(bc); + + for (size_t arg = 0; arg < last_node.ArgumentSize(); arg++) { + BytecodeArgumentMapping arg_map = last_node.ArgumentMapping(arg); + int arg_pos = start_pc + arg_map.offset; + // If we map any jump source we mark the old source for deletion and insert + // a new jump. + auto jump_edge_iter = jump_edges_.find(arg_pos); + if (jump_edge_iter != jump_edges_.end()) { + int jump_source = jump_edge_iter->first; + int jump_destination = jump_edge_iter->second; + // Add new jump edge add current position. + jump_edges_mapped_.emplace(Length(), jump_destination); + // Mark old jump edge for deletion. + delete_jumps.push_back(jump_source); + // Decrement usage count of jump destination. 
+ auto jump_count_iter = jump_usage_counts_.find(jump_destination); + DCHECK(jump_count_iter != jump_usage_counts_.end()); + int& usage_count = jump_count_iter->second; + --usage_count; + } + // TODO(pthier): DCHECK that mapped arguments are never sources of jumps + // to destinations inside the sequence. + EmitArgument(start_pc, bytecode, arg_map); + } + DCHECK_EQ(pc(), optimized_start_pc + + RegExpBytecodeLength(last_node.OptimizedBytecode())); + + // Remove jumps from arguments we ignore. + if (last_node.HasIgnoredArguments()) { + for (auto ignored_arg = last_node.ArgumentIgnoredBegin(); + ignored_arg != last_node.ArgumentIgnoredEnd(); ignored_arg++) { + auto jump_edge_iter = jump_edges_.find(start_pc + ignored_arg->offset); + if (jump_edge_iter != jump_edges_.end()) { + int jump_source = jump_edge_iter->first; + int jump_destination = jump_edge_iter->second; + // Mark old jump edge for deletion. + delete_jumps.push_back(jump_source); + // Decrement usage count of jump destination. + auto jump_count_iter = jump_usage_counts_.find(jump_destination); + DCHECK(jump_count_iter != jump_usage_counts_.end()); + int& usage_count = jump_count_iter->second; + --usage_count; + } + } + } + + int fixup_length = RegExpBytecodeLength(bc) - last_node.SequenceLength(); + + // Check if there are any jumps inside the old sequence. + // If so we have to keep the bytecodes that are jumped to around. + auto jump_destination_candidate = jump_usage_counts_.upper_bound(start_pc); + int jump_candidate_destination = jump_destination_candidate->first; + int jump_candidate_count = jump_destination_candidate->second; + // Jump destinations only jumped to from inside the sequence will be ignored. 
+ while (jump_destination_candidate != jump_usage_counts_.end() && + jump_candidate_count == 0) { + ++jump_destination_candidate; + jump_candidate_destination = jump_destination_candidate->first; + jump_candidate_count = jump_destination_candidate->second; + } + + int preserve_from = start_pc + last_node.SequenceLength(); + if (jump_destination_candidate != jump_usage_counts_.end() && + jump_candidate_destination < start_pc + last_node.SequenceLength()) { + preserve_from = jump_candidate_destination; + // Check if any jump in the sequence we are preserving has a jump + // destination inside the optimized sequence before the current position we + // want to preserve. If so we have to preserve all bytecodes starting at + // this jump destination. + for (auto jump_iter = jump_edges_.lower_bound(preserve_from); + jump_iter != jump_edges_.end() && + jump_iter->first /* jump source */ < + start_pc + last_node.SequenceLength(); + ++jump_iter) { + int jump_destination = jump_iter->second; + if (jump_destination > start_pc && jump_destination < preserve_from) { + preserve_from = jump_destination; + } + } + + // We preserve everything to the end of the sequence. This is conservative + // since it would be enough to preserve all bytecudes up to an unconditional + // jump. + int preserve_length = start_pc + last_node.SequenceLength() - preserve_from; + fixup_length += preserve_length; + // Jumps after the start of the preserved sequence need fixup. + AddJumpSourceFixup(fixup_length, + start_pc + last_node.SequenceLength() - preserve_length); + // All jump targets after the start of the optimized sequence need to be + // fixed relative to the length of the optimized sequence including + // bytecodes we preserved. + AddJumpDestinationFixup(fixup_length, start_pc + 1); + // Jumps to the sequence we preserved need absolute fixup as they could + // occur before or after the sequence. 
+ SetJumpDestinationFixup(pc() - preserve_from, preserve_from); + CopyRangeToOutput(bytecode, preserve_from, preserve_length); + } else { + AddJumpDestinationFixup(fixup_length, start_pc + 1); + // Jumps after the end of the old sequence need fixup. + AddJumpSourceFixup(fixup_length, start_pc + last_node.SequenceLength()); + } + + // Delete jumps we definitely don't need anymore + for (int del : delete_jumps) { + if (del < preserve_from) { + jump_edges_.erase(del); + } + } +} + +void RegExpBytecodePeephole::AddJumpSourceFixup(int fixup, int pos) { + auto previous_fixup = jump_source_fixups_.lower_bound(pos); + DCHECK(previous_fixup != jump_source_fixups_.end()); + DCHECK(previous_fixup != jump_source_fixups_.begin()); + + int previous_fixup_value = (--previous_fixup)->second; + jump_source_fixups_[pos] = previous_fixup_value + fixup; +} + +void RegExpBytecodePeephole::AddJumpDestinationFixup(int fixup, int pos) { + auto previous_fixup = jump_destination_fixups_.lower_bound(pos); + DCHECK(previous_fixup != jump_destination_fixups_.end()); + DCHECK(previous_fixup != jump_destination_fixups_.begin()); + + int previous_fixup_value = (--previous_fixup)->second; + jump_destination_fixups_[pos] = previous_fixup_value + fixup; +} + +void RegExpBytecodePeephole::SetJumpDestinationFixup(int fixup, int pos) { + auto previous_fixup = jump_destination_fixups_.lower_bound(pos); + DCHECK(previous_fixup != jump_destination_fixups_.end()); + DCHECK(previous_fixup != jump_destination_fixups_.begin()); + + int previous_fixup_value = (--previous_fixup)->second; + jump_destination_fixups_.emplace(pos, fixup); + jump_destination_fixups_.emplace(pos + 1, previous_fixup_value); +} + +void RegExpBytecodePeephole::PrepareJumpStructures( + const ZoneUnorderedMap<int, int>& jump_edges) { + for (auto jump_edge : jump_edges) { + int jump_source = jump_edge.first; + int jump_destination = jump_edge.second; + + jump_edges_.emplace(jump_source, jump_destination); + 
jump_usage_counts_[jump_destination]++; + } +} + +void RegExpBytecodePeephole::FixJumps() { + int position_fixup = 0; + // Next position where fixup changes. + auto next_source_fixup = jump_source_fixups_.lower_bound(0); + int next_source_fixup_offset = next_source_fixup->first; + int next_source_fixup_value = next_source_fixup->second; + + for (auto jump_edge : jump_edges_) { + int jump_source = jump_edge.first; + int jump_destination = jump_edge.second; + while (jump_source >= next_source_fixup_offset) { + position_fixup = next_source_fixup_value; + ++next_source_fixup; + next_source_fixup_offset = next_source_fixup->first; + next_source_fixup_value = next_source_fixup->second; + } + jump_source += position_fixup; + + FixJump(jump_source, jump_destination); + } + + // Mapped jump edges don't need source fixups, as the position already is an + // offset in the new bytecode. + for (auto jump_edge : jump_edges_mapped_) { + int jump_source = jump_edge.first; + int jump_destination = jump_edge.second; + + FixJump(jump_source, jump_destination); + } +} + +void RegExpBytecodePeephole::FixJump(int jump_source, int jump_destination) { + int fixed_jump_destination = + jump_destination + + (--jump_destination_fixups_.upper_bound(jump_destination))->second; + DCHECK_LT(fixed_jump_destination, Length()); +#ifdef DEBUG + // TODO(pthier): This check could be better if we track the bytecodes + // actually used and check if we jump to one of them. 
+ byte jump_bc = optimized_bytecode_buffer_[fixed_jump_destination]; + DCHECK_GT(jump_bc, 0); + DCHECK_LT(jump_bc, kRegExpBytecodeCount); +#endif + + if (jump_destination != fixed_jump_destination) { + OverwriteValue<uint32_t>(jump_source, fixed_jump_destination); + } +} + +void RegExpBytecodePeephole::AddSentinelFixups(int pos) { + jump_source_fixups_.emplace(pos, 0); + jump_destination_fixups_.emplace(pos, 0); +} + +template <typename T> +void RegExpBytecodePeephole::EmitValue(T value) { + DCHECK(optimized_bytecode_buffer_.begin() + pc() == + optimized_bytecode_buffer_.end()); + byte* value_byte_iter = reinterpret_cast<byte*>(&value); + optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(), + value_byte_iter, + value_byte_iter + sizeof(T)); +} + +template <typename T> +void RegExpBytecodePeephole::OverwriteValue(int offset, T value) { + byte* value_byte_iter = reinterpret_cast<byte*>(&value); + byte* value_byte_iter_end = value_byte_iter + sizeof(T); + while (value_byte_iter < value_byte_iter_end) { + optimized_bytecode_buffer_[offset++] = *value_byte_iter++; + } +} + +void RegExpBytecodePeephole::CopyRangeToOutput(const byte* orig_bytecode, + int start, int length) { + DCHECK(optimized_bytecode_buffer_.begin() + pc() == + optimized_bytecode_buffer_.end()); + optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(), + orig_bytecode + start, + orig_bytecode + start + length); +} + +void RegExpBytecodePeephole::SetRange(byte value, int count) { + DCHECK(optimized_bytecode_buffer_.begin() + pc() == + optimized_bytecode_buffer_.end()); + optimized_bytecode_buffer_.insert(optimized_bytecode_buffer_.end(), count, + value); +} + +void RegExpBytecodePeephole::EmitArgument(int start_pc, const byte* bytecode, + BytecodeArgumentMapping arg) { + int arg_pos = start_pc + arg.offset; + switch (arg.length) { + case 1: + DCHECK_EQ(arg.new_length, arg.length); + EmitValue(GetValue<byte>(bytecode, arg_pos)); + break; + case 2: + DCHECK_EQ(arg.new_length, 
arg.length); + EmitValue(GetValue<uint16_t>(bytecode, arg_pos)); + break; + case 3: { + // Length 3 only occurs in 'packed' arguments where the lowermost byte is + // the current bytecode, and the remaining 3 bytes are the packed value. + // + // We load 4 bytes from position - 1 and shift out the bytecode. +#ifdef V8_TARGET_BIG_ENDIAN + UNIMPLEMENTED(); + int32_t val = 0; +#else + int32_t val = GetValue<int32_t>(bytecode, arg_pos - 1) >> kBitsPerByte; +#endif // V8_TARGET_BIG_ENDIAN + + switch (arg.new_length) { + case 2: + EmitValue<uint16_t>(val); + break; + case 3: { + // Pack with previously emitted value. + auto prev_val = + GetValue<int32_t>(&(*optimized_bytecode_buffer_.begin()), + Length() - sizeof(uint32_t)); +#ifdef V8_TARGET_BIG_ENDIAN + UNIMPLEMENTED(); + USE(prev_val); +#else + DCHECK_EQ(prev_val & 0xFFFFFF00, 0); + OverwriteValue<uint32_t>( + pc() - sizeof(uint32_t), + (static_cast<uint32_t>(val) << 8) | (prev_val & 0xFF)); +#endif // V8_TARGET_BIG_ENDIAN + break; + } + case 4: + EmitValue<uint32_t>(val); + break; + } + break; + } + case 4: + DCHECK_EQ(arg.new_length, arg.length); + EmitValue(GetValue<uint32_t>(bytecode, arg_pos)); + break; + case 8: + DCHECK_EQ(arg.new_length, arg.length); + EmitValue(GetValue<uint64_t>(bytecode, arg_pos)); + break; + default: + CopyRangeToOutput(bytecode, arg_pos, + std::min(arg.length, arg.new_length)); + if (arg.length < arg.new_length) { + SetRange(0x00, arg.new_length - arg.length); + } + break; + } +} + +int RegExpBytecodePeephole::pc() const { + DCHECK_LE(optimized_bytecode_buffer_.size(), std::numeric_limits<int>::max()); + return static_cast<int>(optimized_bytecode_buffer_.size()); +} + +Zone* RegExpBytecodePeephole::zone() const { return zone_; } + +} // namespace + +// static +Handle<ByteArray> RegExpBytecodePeepholeOptimization::OptimizeBytecode( + Isolate* isolate, Zone* zone, Handle<String> source, const byte* bytecode, + int length, const ZoneUnorderedMap<int, int>& jump_edges) { + 
RegExpBytecodePeephole peephole(zone, length, jump_edges); + bool did_optimize = peephole.OptimizeBytecode(bytecode, length); + Handle<ByteArray> array = isolate->factory()->NewByteArray(peephole.Length()); + peephole.CopyOptimizedBytecode(array->GetDataStartAddress()); + + if (did_optimize && v8_flags.trace_regexp_peephole_optimization) { + PrintF("Original Bytecode:\n"); + RegExpBytecodeDisassemble(bytecode, length, source->ToCString().get()); + PrintF("Optimized Bytecode:\n"); + RegExpBytecodeDisassemble(array->GetDataStartAddress(), peephole.Length(), + source->ToCString().get()); + } + + return array; +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.h b/js/src/irregexp/imported/regexp-bytecode-peephole.h new file mode 100644 index 0000000000..5b8a0c7b4b --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecode-peephole.h @@ -0,0 +1,30 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_ +#define V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_ + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class ByteArray; + +// Peephole optimization for regexp interpreter bytecode. +// Pre-defined bytecode sequences occuring in the bytecode generated by the +// RegExpBytecodeGenerator can be optimized into a single bytecode. +class RegExpBytecodePeepholeOptimization : public AllStatic { + public: + // Performs peephole optimization on the given bytecode and returns the + // optimized bytecode. 
+ static Handle<ByteArray> OptimizeBytecode( + Isolate* isolate, Zone* zone, Handle<String> source, const byte* bytecode, + int length, const ZoneUnorderedMap<int, int>& jump_edges); +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_ diff --git a/js/src/irregexp/imported/regexp-bytecodes.cc b/js/src/irregexp/imported/regexp-bytecodes.cc new file mode 100644 index 0000000000..829bea9180 --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecodes.cc @@ -0,0 +1,46 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/imported/regexp-bytecodes.h" + +#include <cctype> + + +namespace v8 { +namespace internal { + +void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc) { + int bytecode = *reinterpret_cast<const int32_t*>(pc) & BYTECODE_MASK; + PrintF("%s", RegExpBytecodeName(bytecode)); + + // Args and the bytecode as hex. + for (int i = 0; i < RegExpBytecodeLength(bytecode); i++) { + PrintF(", %02x", pc[i]); + } + PrintF(" "); + + // Args as ascii. + for (int i = 1; i < RegExpBytecodeLength(bytecode); i++) { + unsigned char b = pc[i]; + PrintF("%c", std::isprint(b) ? 
b : '.'); + } + PrintF("\n"); +} + +void RegExpBytecodeDisassemble(const byte* code_base, int length, + const char* pattern) { + PrintF("[generated bytecode for regexp pattern: '%s']\n", pattern); + + ptrdiff_t offset = 0; + + while (offset < length) { + const byte* const pc = code_base + offset; + PrintF("%p %4" V8PRIxPTRDIFF " ", pc, offset); + RegExpBytecodeDisassembleSingle(code_base, pc); + offset += RegExpBytecodeLength(*pc); + } +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-bytecodes.h b/js/src/irregexp/imported/regexp-bytecodes.h new file mode 100644 index 0000000000..5602d8d7bc --- /dev/null +++ b/js/src/irregexp/imported/regexp-bytecodes.h @@ -0,0 +1,257 @@ +// Copyright 2011 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_BYTECODES_H_ +#define V8_REGEXP_REGEXP_BYTECODES_H_ + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +// Maximum number of bytecodes that will be used (next power of 2 of actually +// defined bytecodes). +// All slots between the last actually defined bytecode and maximum id will be +// filled with BREAKs, indicating an invalid operation. This way using +// BYTECODE_MASK guarantees no OOB access to the dispatch table. +constexpr int kRegExpPaddedBytecodeCount = 1 << 6; +constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1; +// The first argument is packed in with the byte code in one word, but so it +// has 24 bits, but it can be positive and negative so only use 23 bits for +// positive values. +const unsigned int MAX_FIRST_ARG = 0x7fffffu; +const int BYTECODE_SHIFT = 8; +static_assert(1 << BYTECODE_SHIFT > BYTECODE_MASK); + +// The list of bytecodes, in format: V(Name, Code, ByteLength). +// TODO(pthier): Argument offsets of bytecodes should be easily accessible by +// name or at least by position. 
// TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
//
// X-macro over all regexp interpreter bytecodes. Every entry has the form
// V(Name, Code, ByteLength), where ByteLength is the total encoded size of the
// instruction in bytes, opcode byte included. The trailing comment on an entry
// lists its fields with their widths in bits (e.g. `bc8 pad24 addr32` is an
// 8-bit opcode, 24 bits of padding and a 32-bit address). "Bit Layout" blocks
// give the bit ranges of those fields in hexadecimal.
#define BYTECODE_ITERATOR(V)                                                   \
  V(BREAK, 0, 4)               /* bc8                                       */ \
  V(PUSH_CP, 1, 4)             /* bc8 pad24                                 */ \
  V(PUSH_BT, 2, 8)             /* bc8 pad24 offset32                        */ \
  V(PUSH_REGISTER, 3, 4)       /* bc8 reg_idx24                             */ \
  V(SET_REGISTER_TO_CP, 4, 8)  /* bc8 reg_idx24 offset32                    */ \
  V(SET_CP_TO_REGISTER, 5, 4)  /* bc8 reg_idx24                             */ \
  V(SET_REGISTER_TO_SP, 6, 4)  /* bc8 reg_idx24                             */ \
  V(SET_SP_TO_REGISTER, 7, 4)  /* bc8 reg_idx24                             */ \
  V(SET_REGISTER, 8, 8)        /* bc8 reg_idx24 value32                     */ \
  V(ADVANCE_REGISTER, 9, 8)    /* bc8 reg_idx24 value32                     */ \
  V(POP_CP, 10, 4)             /* bc8 pad24                                 */ \
  V(POP_BT, 11, 4)             /* bc8 pad24                                 */ \
  V(POP_REGISTER, 12, 4)       /* bc8 reg_idx24                             */ \
  V(FAIL, 13, 4)               /* bc8 pad24                                 */ \
  V(SUCCEED, 14, 4)            /* bc8 pad24                                 */ \
  V(ADVANCE_CP, 15, 4)         /* bc8 offset24                              */ \
  /* Jump to another bytecode given its offset.                             */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x10 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
  /* 0x20 - 0x3F:   Address of bytecode to jump to                          */ \
  V(GOTO, 16, 8) /* bc8 pad24 addr32                                        */ \
  /* Check if offset is in range and load character at given offset.        */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x11 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   Offset from current position                            */ \
  /* 0x20 - 0x3F:   Address of bytecode when load is out of range           */ \
  V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32                        */ \
  /* Load character at given offset without range checks.                   */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x12 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   Offset from current position                            */ \
  V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4)    /* bc8 offset24                  */ \
  V(LOAD_2_CURRENT_CHARS, 19, 8)           /* bc8 offset24 addr32           */ \
  V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24                  */ \
  V(LOAD_4_CURRENT_CHARS, 21, 8)           /* bc8 offset24 addr32           */ \
  V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24                  */ \
  V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32                       */ \
  /* Check if current character is equal to a given character               */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x18 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x0F:   0x00 (unused) Padding                                   */ \
  /* 0x10 - 0x1F:   Character to check                                      */ \
  /* 0x20 - 0x3F:   Address of bytecode when matched                        */ \
  V(CHECK_CHAR, 24, 8)         /* bc8 pad8 uint16 addr32                    */ \
  V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32                   */ \
  V(CHECK_NOT_CHAR, 26, 8)     /* bc8 pad8 uint16 addr32                    */ \
  V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32            */ \
  /* Checks if the current character combined with mask (bitwise and)       */ \
  /* matches a character (e.g. used when two characters in a disjunction    */ \
  /* differ by only a single bit).                                          */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x1c (fixed) Bytecode                                   */ \
  /* 0x08 - 0x0F:   0x00 (unused) Padding                                   */ \
  /* 0x10 - 0x1F:   Character to match against (after mask applied)         */ \
  /* 0x20 - 0x3F:   Bitmask bitwise and combined with current character     */ \
  /* 0x40 - 0x5F:   Address of bytecode when matched                        */ \
  V(AND_CHECK_CHAR, 28, 12)        /* bc8 pad8 uint16 uint32 addr32         */ \
  V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32        */ \
  V(AND_CHECK_NOT_CHAR, 30, 12)    /* bc8 pad8 uint16 uint32 addr32         */ \
  V(MINUS_AND_CHECK_NOT_CHAR, 31,                                              \
    12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32                 */ \
  V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32  */ \
  V(CHECK_CHAR_NOT_IN_RANGE, 33,                                               \
    12) /* bc8 pad24 base::uc16 base::uc16 addr32                           */ \
  /* Checks if the current character matches any of the characters encoded  */ \
  /* in a bit table. Similar to/inspired by boyer moore string search       */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x22 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
  /* 0x20 - 0x3F:   Address of bytecode when bit is set                     */ \
  /* 0x40 - 0xBF:   Bit table                                               */ \
  V(CHECK_BIT_IN_TABLE, 34, 24)         /* bc8 pad24 addr32 bits128         */ \
  V(CHECK_LT, 35, 8)                    /* bc8 pad8 base::uc16 addr32       */ \
  V(CHECK_GT, 36, 8)                    /* bc8 pad8 base::uc16 addr32       */ \
  V(CHECK_NOT_BACK_REF, 37, 8)          /* bc8 reg_idx24 addr32             */ \
  V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8)  /* bc8 reg_idx24 addr32             */ \
  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8)                                 \
  V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8)         /* bc8 reg_idx24 addr32     */ \
  V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32     */ \
  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8)                        \
  V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32          */ \
  V(CHECK_REGISTER_LT, 44, 12)    /* bc8 reg_idx24 value32 addr32           */ \
  V(CHECK_REGISTER_GE, 45, 12)    /* bc8 reg_idx24 value32 addr32           */ \
  V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32                   */ \
  V(CHECK_AT_START, 47, 8)        /* bc8 pad24 addr32                       */ \
  V(CHECK_NOT_AT_START, 48, 8)    /* bc8 offset24 addr32                    */ \
  /* Checks if the current position matches top of backtrack stack          */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x31 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
  /* 0x20 - 0x3F:   Address of bytecode when current matches tos            */ \
  V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32                                */ \
  /* Advance character pointer by given offset and jump to another bytecode.*/ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x32 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   Number of characters to advance                         */ \
  /* 0x20 - 0x3F:   Address of bytecode to jump to                          */ \
  V(ADVANCE_CP_AND_GOTO, 50, 8)           /* bc8 offset24 addr32            */ \
  V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24                      */ \
  /* Checks if current position + given offset is in range.                 */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07:   0x34 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F:   Offset from current position                            */ \
  /* 0x20 - 0x3F:   Address of bytecode when position is out of range       */ \
  V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32                      */ \
  /* Combination of:                                                        */ \
  /* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO          */ \
  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07    0x35 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F    Load character offset from current position             */ \
  /* 0x20 - 0x3F    Number of characters to advance                         */ \
  /* 0x40 - 0xBF    Bit Table                                               */ \
  /* 0xC0 - 0xDF    Address of bytecode when character is matched           */ \
  /* 0xE0 - 0xFF    Address of bytecode when no match                       */ \
  V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32)                                           \
  /* Combination of:                                                        */ \
  /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR    */ \
  /* and ADVANCE_CP_AND_GOTO                                                */ \
  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07    0x36 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F    Load character offset from current position             */ \
  /* 0x20 - 0x2F    Number of characters to advance                         */ \
  /* 0x30 - 0x3F    Character to match against (after mask applied)         */ \
  /* 0x40 - 0x5F:   Bitmask bitwise and combined with current character     */ \
  /* 0x60 - 0x7F    Minimum number of characters this pattern consumes      */ \
  /* 0x80 - 0x9F    Address of bytecode when character is matched           */ \
  /* 0xA0 - 0xBF    Address of bytecode when no match                       */ \
  V(SKIP_UNTIL_CHAR_AND, 54, 24)                                               \
  /* Combination of:                                                        */ \
  /* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO                  */ \
  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07    0x37 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F    Load character offset from current position             */ \
  /* 0x20 - 0x2F    Number of characters to advance                         */ \
  /* 0x30 - 0x3F    Character to match                                      */ \
  /* 0x40 - 0x5F    Address of bytecode when character is matched           */ \
  /* 0x60 - 0x7F    Address of bytecode when no match                       */ \
  V(SKIP_UNTIL_CHAR, 55, 16)                                                   \
  /* Combination of:                                                        */ \
  /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR        */ \
  /* and ADVANCE_CP_AND_GOTO                                                */ \
  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07    0x38 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F    Load character offset from current position             */ \
  /* 0x20 - 0x2F    Number of characters to advance                         */ \
  /* 0x30 - 0x3F    Character to match                                      */ \
  /* 0x40 - 0x5F    Minimum number of characters this pattern consumes      */ \
  /* 0x60 - 0x7F    Address of bytecode when character is matched           */ \
  /* 0x80 - 0x9F    Address of bytecode when no match                       */ \
  V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20)                                       \
  /* Combination of:                                                        */ \
  /* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO      */ \
  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07    0x39 (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F    Load character offset from current position             */ \
  /* 0x20 - 0x3F    Number of characters to advance                         */ \
  /* 0x40 - 0x4F    Character to match                                      */ \
  /* 0x50 - 0x5F    Other Character to match                                */ \
  /* 0x60 - 0x7F    Address of bytecode when either character is matched    */ \
  /* 0x80 - 0x9F    Address of bytecode when no match                       */ \
  V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20)                                           \
  /* Combination of:                                                        */ \
  /* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO                  */ \
  /* and ADVANCE_CP_AND_GOTO                                                */ \
  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
  /* Bit Layout:                                                            */ \
  /* 0x00 - 0x07    0x3A (fixed) Bytecode                                   */ \
  /* 0x08 - 0x1F    Load character offset from current position             */ \
  /* 0x20 - 0x2F    Number of characters to advance                         */ \
  /* 0x30 - 0x3F    Character to check if it is less than current char      */ \
  /* 0x40 - 0xBF    Bit Table                                               */ \
  /* 0xC0 - 0xDF    Address of bytecode when character is matched           */ \
  /* 0xE0 - 0xFF    Address of bytecode when no match                       */ \
  V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)

// Every list entry expands to `+1`, so the expansion sums to the number of
// bytecodes defined above.
#define COUNT(...) +1
static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
#undef COUNT

// Just making sure we assigned values above properly. They should be
// contiguous, strictly increasing, and start at 0.
// TODO(jgruber): Do not explicitly assign values, instead generate them
// implicitly from the list order.
// Pins the number of list entries; it has to be updated together with
// BYTECODE_ITERATOR whenever a bytecode is added or removed. Note that this
// checks the count only, not the individual code values.
static_assert(kRegExpBytecodeCount == 59);

// Defines one constant BC_<Name> per bytecode, holding that bytecode's code.
#define DECLARE_BYTECODES(name, code, length) \
  static constexpr int BC_##name = code;
BYTECODE_ITERATOR(DECLARE_BYTECODES)
#undef DECLARE_BYTECODES

// Encoded length in bytes of every bytecode, stored in list order. Indexing
// this table by bytecode code relies on the codes being contiguous and
// starting at 0.
static constexpr int kRegExpBytecodeLengths[] = {
#define DECLARE_BYTECODE_LENGTH(name, code, length) length,
    BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
#undef DECLARE_BYTECODE_LENGTH
};

// Returns the encoded length in bytes of |bytecode|.
// |bytecode| must be in [0, kRegExpBytecodeCount - 1]; this is only checked by
// the DCHECK below.
inline constexpr int RegExpBytecodeLength(int bytecode) {
  DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
  return kRegExpBytecodeLengths[bytecode];
}

// Name of every bytecode (the list identifier as a string), in list order.
static constexpr const char* const kRegExpBytecodeNames[] = {
#define DECLARE_BYTECODE_NAME(name, ...) #name,
    BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
#undef DECLARE_BYTECODE_NAME
};

// Returns the name of |bytecode|.
// |bytecode| must be in [0, kRegExpBytecodeCount - 1]; this is only checked by
// the DCHECK below.
inline constexpr const char* RegExpBytecodeName(int bytecode) {
  DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
  return kRegExpBytecodeNames[bytecode];
}

// Prints the single bytecode located at |pc|.
void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
// Prints all bytecodes in [code_base, code_base + length), preceded by a
// header line that mentions |pattern|.
void RegExpBytecodeDisassemble(const byte* code_base, int length,
                               const char* pattern);

#include "irregexp/imported/regexp-compiler.h"

#include "irregexp/imported/regexp.h"

#ifdef V8_INTL_SUPPORT
#include "irregexp/imported/special-case.h"
#include "unicode/locid.h"
#include "unicode/uniset.h"
#include "unicode/utypes.h"
#endif  // V8_INTL_SUPPORT

namespace v8 {
namespace internal {

using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)

// Largest Unicode code point and largest UTF-16 code unit (the latter in both
// signed and unsigned flavours), plus the largest one-byte (Latin1) char code.
constexpr base::uc32 kMaxCodePoint = 0x10ffff;
constexpr int kMaxUtf16CodeUnit = 0xffff;
constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;

// -------------------------------------------------------------------
// Tree to graph conversion

// Wraps this atom in a one-element TextElement list and returns a TextNode
// over it that continues with |on_success|. The read direction is taken from
// the compiler.
RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) {
  ZoneList<TextElement>* elms =
      compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
  elms->Add(TextElement::Atom(this), compiler->zone());
  return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
                                         on_success);
}

// Returns a TextNode over this text's existing element list that continues
// with |on_success|.
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) {
  return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
                                         on_success);
}

namespace {

// Returns true if |ranges| is exactly the complement (within
// [0, kMaxCodePoint]) of the class described by |special_class|.
//
// |special_class| holds |length| ints: (from, exclusive-to) pairs followed by
// a kRangeEndMarker terminator. The complement must therefore start at 0, each
// of its ranges must end right before a special-class range starts and the
// next one must start where that special-class range ends, and the last range
// must end at kMaxCodePoint.
bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
                          const int* special_class, int length) {
  length--;  // Remove final marker.

  DCHECK_EQ(kRangeEndMarker, special_class[length]);
  DCHECK_NE(0, ranges->length());
  DCHECK_NE(0, length);
  DCHECK_NE(0, special_class[0]);

  // N special-class ranges leave N + 1 gaps.
  if (ranges->length() != (length >> 1) + 1) return false;

  CharacterRange range = ranges->at(0);
  if (range.from() != 0) return false;

  for (int i = 0; i < length; i += 2) {
    // The current gap must end immediately before the special-class range.
    if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
      return false;
    }
    // The next gap must begin at the (exclusive) end of that range.
    range = ranges->at((i >> 1) + 1);
    if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
      return false;
    }
  }

  return range.to() == kMaxCodePoint;
}

// Returns true if |ranges| matches |special_class| range for range.
// |special_class| uses the same layout as in CompareInverseRanges: (from,
// exclusive-to) pairs followed by a kRangeEndMarker terminator, |length| ints
// in total.
bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
                   int length) {
  length--;  // Remove final marker.

  DCHECK_EQ(kRangeEndMarker, special_class[length]);
  if (ranges->length() * 2 != length) return false;

  for (int i = 0; i < length; i += 2) {
    CharacterRange range = ranges->at(i >> 1);
    // CharacterRange ends are inclusive, the table's ends are exclusive.
    if (range.from() != static_cast<base::uc32>(special_class[i]) ||
        range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
      return false;
    }
  }
  return true;
}

}  // namespace

// Returns true if this (non-negated) class is one of the standard character
// sets. If the set is not already marked as standard, its ranges are compared
// against the whitespace, line terminator and word tables (and their
// complements); on a match the detected type is recorded on |set_| before
// returning true.
bool RegExpClassRanges::is_standard(Zone* zone) {
  // TODO(lrn): Remove need for this function, by not throwing away information
  // along the way.
  if (is_negated()) {
    return false;
  }
  if (set_.is_standard()) {
    return true;
  }
  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
    set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
    return true;
  }
  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
    set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
    return true;
  }
  if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
                           kLineTerminatorRangeCount)) {
    set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
    return true;
  }
  if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
                    kLineTerminatorRangeCount)) {
    set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
    return true;
  }
  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
    set_.set_standard_set_type(StandardCharacterSet::kWord);
    return true;
  }
  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
    set_.set_standard_set_type(StandardCharacterSet::kNotWord);
    return true;
  }
  return false;
}

UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
  // The unicode range splitter categorizes given character ranges into:
  // - Code points from the BMP representable by one code unit.
  // - Code points outside the BMP that need to be split into
  //   surrogate pairs.
  // - Lone lead surrogates.
  // - Lone trail surrogates.
  // Lone surrogates are valid code points, even though they are not actual
  // characters. They require special matching to make sure we do not split
  // surrogate pairs.

  for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
}

// Clips |range| against five consecutive code point segments (BMP below the
// surrogates, lead surrogates, trail surrogates, BMP above the surrogates,
// non-BMP) and appends each non-empty piece to the matching member vector.
// Both BMP segments feed the same |bmp_| vector.
void UnicodeRangeSplitter::AddRange(CharacterRange range) {
  static constexpr base::uc32 kBmp1Start = 0;
  static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
  static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
  static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;

  // Ends are all inclusive.
  static_assert(kBmp1Start == 0);
  static_assert(kBmp1Start < kBmp1End);
  static_assert(kBmp1End + 1 == kLeadSurrogateStart);
  static_assert(kLeadSurrogateStart < kLeadSurrogateEnd);
  static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
  static_assert(kTrailSurrogateStart < kTrailSurrogateEnd);
  static_assert(kTrailSurrogateEnd + 1 == kBmp2Start);
  static_assert(kBmp2Start < kBmp2End);
  static_assert(kBmp2End + 1 == kNonBmpStart);
  static_assert(kNonBmpStart < kNonBmpEnd);

  // Parallel tables: segment i spans [kStarts[i], kEnds[i]] and is collected
  // into *kTargets[i].
  static constexpr base::uc32 kStarts[] = {
      kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
      kBmp2Start, kNonBmpStart,
  };

  static constexpr base::uc32 kEnds[] = {
      kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
  };

  CharacterRangeVector* const kTargets[] = {
      &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
  };

  static constexpr int kCount = arraysize(kStarts);
  static_assert(kCount == arraysize(kEnds));
  static_assert(kCount == arraysize(kTargets));

  for (int i = 0; i < kCount; i++) {
    // Segments are ascending, so nothing further can overlap |range|.
    if (kStarts[i] > range.to()) break;
    const base::uc32 from = std::max(kStarts[i], range.from());
    const base::uc32 to = std::min(kEnds[i], range.to());
    if (from > to) continue;  // No overlap with this segment.
    kTargets[i]->emplace_back(CharacterRange::Range(from, to));
  }
}

namespace {

// Translates between new and old V8-isms (SmallVector, ZoneList).
// Copies |v| into a freshly zone-allocated ZoneList and canonicalizes it.
// Returns nullptr (rather than an empty list) when |v| is empty.
ZoneList<CharacterRange>* ToCanonicalZoneList(
    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
  if (v->empty()) return nullptr;

  ZoneList<CharacterRange>* result =
      zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
  for (size_t i = 0; i < v->size(); i++) {
    result->Add(v->at(i), zone);
  }

  CharacterRange::Canonicalize(result);
  return result;
}

// Adds one alternative to |result| that matches the splitter's BMP ranges.
// Does nothing if there are none.
void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
  ZoneList<CharacterRange>* bmp =
      ToCanonicalZoneList(splitter->bmp(), compiler->zone());
  if (bmp == nullptr) return;
  result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
      compiler->zone(), bmp, compiler->read_backward(), on_success)));
}

using UC16Range = uint32_t;  // {from, to} packed into one uint32_t.
// Packs |from| into the upper and |to| into the lower 16 bits.
constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
  return (static_cast<uint32_t>(from) << 16) | to;
}
// Returns the upper 16 bits of |r| (the range start).
constexpr base::uc16 ExtractFrom(UC16Range r) {
  return static_cast<base::uc16>(r >> 16);
}
// Returns the lower 16 bits of |r| (the range end).
constexpr base::uc16 ExtractTo(UC16Range r) {
  return static_cast<base::uc16>(r);
}

// Adds alternatives to |result| that match the splitter's non-BMP ranges as
// lead/trail surrogate pairs. Does nothing if there are no non-BMP ranges.
void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
                             RegExpNode* on_success,
                             UnicodeRangeSplitter* splitter) {
  DCHECK(!compiler->one_byte());
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* non_bmp =
      ToCanonicalZoneList(splitter->non_bmp(), zone);
  if (non_bmp == nullptr) return;

  // Translate each 32-bit code point range into the corresponding 16-bit code
  // unit representation consisting of the lead- and trail surrogate.
  //
  // The generated alternatives are grouped by the leading surrogate to avoid
  // emitting excessive code. For example, for
  //
  // { \ud800[\udc00-\udc01]
  // , \ud800[\udc05-\udc06]
  // }
  //
  // there's no need to emit matching code for the leading surrogate \ud800
  // twice. We also create a dedicated grouping for full trailing ranges, i.e.
  // [dc00-dfff].
  ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
      zone);
  ZoneList<CharacterRange>* leading_with_full_trailing_range =
      zone->New<ZoneList<CharacterRange>>(1, zone);
  // Records the pair range [from_l-to_l][from_t-to_t]. A leading range that
  // has no group yet and covers the full trailing range goes into
  // |leading_with_full_trailing_range|; everything else is appended to the
  // (possibly newly created) group keyed by the packed leading range.
  const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
                            base::uc16 from_t, base::uc16 to_t) {
    const UC16Range leading_range = ToUC16Range(from_l, to_l);
    if (grouped_by_leading.count(leading_range) == 0) {
      if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
        leading_with_full_trailing_range->Add(
            CharacterRange::Range(from_l, to_l), zone);
        return;
      }
      grouped_by_leading[leading_range] =
          zone->New<ZoneList<CharacterRange>>(2, zone);
    }
    grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
                                           zone);
  };

  // First, create the grouped ranges.
  CharacterRange::Canonicalize(non_bmp);
  for (int i = 0; i < non_bmp->length(); i++) {
    // Match surrogate pair.
    // E.g. [\u{10005}-\u{11005}] becomes
    //      \ud800[\udc05-\udfff]|
    //      [\ud801-\ud803][\udc00-\udfff]|
    //      \ud804[\udc00-\udc05]
    base::uc32 from = non_bmp->at(i).from();
    base::uc32 to = non_bmp->at(i).to();
    base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
    base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
    base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
    base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);

    if (from_l == to_l) {
      // The lead surrogate is the same.
      AddRange(from_l, to_l, from_t, to_t);
      continue;
    }

    if (from_t != kTrailSurrogateStart) {
      // Add [from_l][from_t-\udfff].
      AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
      from_l++;
    }
    if (to_t != kTrailSurrogateEnd) {
      // Add [to_l][\udc00-to_t].
      AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
      to_l--;
    }
    if (from_l <= to_l) {
      // Add [from_l-to_l][\udc00-\udfff].
      AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
    }
  }

  // Create the actual TextNode now that ranges are fully grouped.
  if (!leading_with_full_trailing_range->is_empty()) {
    CharacterRange::Canonicalize(leading_with_full_trailing_range);
    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
        zone, leading_with_full_trailing_range,
        CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
        compiler->read_backward(), on_success)));
  }
  // One alternative per distinct leading range, matching that leading range
  // followed by any of its collected trailing ranges.
  for (const auto& it : grouped_by_leading) {
    CharacterRange leading_range =
        CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
    ZoneList<CharacterRange>* trailing_ranges = it.second;
    CharacterRange::Canonicalize(trailing_ranges);
    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
        zone, leading_range, trailing_ranges, compiler->read_backward(),
        on_success)));
  }
}

// Builds: a negative lookaround that reads |lookbehind| against the read
// direction (!read_backward), followed by a match of |match| in the read
// direction that continues with |on_success|.
RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
    ZoneList<CharacterRange>* match, RegExpNode* on_success,
    bool read_backward) {
  Zone* zone = compiler->zone();
  RegExpNode* match_node = TextNode::CreateForCharacterRanges(
      zone, match, read_backward, on_success);
  int stack_register = compiler->UnicodeLookaroundStackRegister();
  int position_register = compiler->UnicodeLookaroundPositionRegister();
  RegExpLookaround::Builder lookaround(false, match_node, stack_register,
                                       position_register);
  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
      zone, lookbehind, !read_backward, lookaround.on_match_success());
  return lookaround.ForMatch(negative_match);
}

// Builds: a match of |match| in the read direction, followed by a negative
// lookaround that reads |lookahead| in the same direction and, when the
// lookaround does not match, continues with |on_success|.
RegExpNode* MatchAndNegativeLookaroundInReadDirection(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
    bool read_backward) {
  Zone* zone = compiler->zone();
  int stack_register = compiler->UnicodeLookaroundStackRegister();
  int position_register = compiler->UnicodeLookaroundPositionRegister();
  RegExpLookaround::Builder lookaround(false, on_success, stack_register,
                                       position_register);
  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
      zone, lookahead, read_backward, lookaround.on_match_success());
  return TextNode::CreateForCharacterRanges(
      zone, match, read_backward, lookaround.ForMatch(negative_match));
}

// Adds an alternative to |result| that matches the splitter's lead surrogates
// only when they are not followed by a trail surrogate. Does nothing if the
// splitter collected no lead surrogates.
void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                           RegExpNode* on_success,
                           UnicodeRangeSplitter* splitter) {
  ZoneList<CharacterRange>* lead_surrogates =
      ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
  if (lead_surrogates == nullptr) return;
  Zone* zone = compiler->zone();
  // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));

  RegExpNode* match;
  if (compiler->read_backward()) {
    // Reading backward. Assert that reading forward, there is no trail
    // surrogate, and then backward match the lead surrogate.
    match = NegativeLookaroundAgainstReadDirectionAndMatch(
        compiler, trail_surrogates, lead_surrogates, on_success, true);
  } else {
    // Reading forward. Forward match the lead surrogate and assert that
    // no trail surrogate follows.
    match = MatchAndNegativeLookaroundInReadDirection(
        compiler, lead_surrogates, trail_surrogates, on_success, false);
  }
  result->AddAlternative(GuardedAlternative(match));
}

// Adds an alternative to |result| that matches the splitter's trail surrogates
// only when they are not preceded by a lead surrogate. Does nothing if the
// splitter collected no trail surrogates.
void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                            RegExpNode* on_success,
                            UnicodeRangeSplitter* splitter) {
  ZoneList<CharacterRange>* trail_surrogates =
      ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
  if (trail_surrogates == nullptr) return;
  Zone* zone = compiler->zone();
  // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));

  RegExpNode* match;
  if (compiler->read_backward()) {
    // Reading backward. Backward match the trail surrogate and assert that no
    // lead surrogate precedes it.
    match = MatchAndNegativeLookaroundInReadDirection(
        compiler, trail_surrogates, lead_surrogates, on_success, true);
  } else {
    // Reading forward. Assert that reading backward, there is no lead
    // surrogate, and then forward match the trail surrogate.
    match = NegativeLookaroundAgainstReadDirectionAndMatch(
        compiler, lead_surrogates, trail_surrogates, on_success, false);
  }
  result->AddAlternative(GuardedAlternative(match));
}

// Returns a forward-reading node that consumes any single UTF-16 code unit
// and then continues with |on_success|.
RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
                              RegExpNode* on_success) {
  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
  DCHECK(!compiler->read_backward());
  Zone* zone = compiler->zone();
  // Advance any character. If the character happens to be a lead surrogate and
  // we advanced into the middle of a surrogate pair, it will work out, as
  // nothing will match from there. We will have to advance again, consuming
  // the associated trail surrogate.
  ZoneList<CharacterRange>* range =
      CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit));
  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
}

}  // namespace

#ifdef V8_INTL_SUPPORT
// static
// Closes |set| over simple case folding in place, using ICU.
void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
  // Remove characters for which closeOver() adds full-case-folding equivalents
  // because we should work only with simple case folding mappings.
  icu::UnicodeSet non_simple = icu::UnicodeSet(set);
  non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
  set.removeAll(non_simple);

  set.closeOver(USET_CASE_INSENSITIVE);
  // Full case folding maps single characters to multiple characters.
  // Those are represented as strings in the set. Remove them so that
  // we end up with only simple and common case mappings.
  set.removeAllStrings();

  // Add characters that have non-simple case foldings again (they match
  // themselves).
  set.addAll(non_simple);
}
#endif  // V8_INTL_SUPPORT

// static
// Replaces the contents of |ranges| with its closure under simple case
// folding. Compiles to a no-op when V8_INTL_SUPPORT is not defined.
void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                               Zone* zone) {
#ifdef V8_INTL_SUPPORT
  DCHECK(IsCanonical(ranges));

  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
  // See also https://crbug.com/v8/6727.
  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
  // which we use frequently internally. But large ranges can also easily be
  // created by the user. We might want to have a more general caching mechanism
  // for such ranges.
  if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;

  // Use ICU to compute the case fold closure over the ranges.
  icu::UnicodeSet set;
  for (int i = 0; i < ranges->length(); i++) {
    set.add(ranges->at(i).from(), ranges->at(i).to());
  }
  // Clear the ranges list without freeing the backing store.
  ranges->Rewind(0);

  UnicodeSimpleCloseOver(set);
  for (int i = 0; i < set.getRangeCount(); i++) {
    ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
  }
  // No errors occurred, and everything we collected is a range.
  Canonicalize(ranges);
#endif  // V8_INTL_SUPPORT
}

// Converts this character class into a matcher node. Without a unicode flag,
// for one-byte subjects, or when the class contains a split surrogate, a plain
// TextNode over the class is returned. Otherwise the (possibly negated) ranges
// are split into BMP, surrogate-pair and lone-surrogate parts, each of which
// becomes an alternative of the returned ChoiceNode.
RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
  set_.Canonicalize();
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* ranges = this->ranges(zone);

  if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
    CharacterRange::AddUnicodeCaseEquivalents(ranges, zone);
  }

  if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() ||
      contains_split_surrogate()) {
    return zone->New<TextNode>(this, compiler->read_backward(), on_success);
  }

  if (is_negated()) {
    // With /v, character classes are never negated.
    // TODO(v8:11935): Change permalink once proposal is in stage 4.
    // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-compileatom
    // Atom :: CharacterClass
    //   4. Assert: cc.[[Invert]] is false.
    // Instead the complement is created when evaluating the class set.
    // The only exception is the "nothing range" (negated everything), which is
    // internally created for an empty set.
    DCHECK_IMPLIES(
        IsUnicodeSets(compiler->flags()),
        ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
    ZoneList<CharacterRange>* negated =
        zone->New<ZoneList<CharacterRange>>(2, zone);
    CharacterRange::Negate(ranges, negated, zone);
    ranges = negated;
  }

  if (ranges->length() == 0) {
    // The empty character class is used as a 'fail' node.
    RegExpClassRanges* fail = zone->New<RegExpClassRanges>(zone, ranges);
    return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
  }

  if (set_.is_standard() &&
      standard_type() == StandardCharacterSet::kEverything) {
    return UnanchoredAdvance(compiler, on_success);
  }

  // Split ranges in order to handle surrogates correctly:
  // - Surrogate pairs: translate the 32-bit code point into two uc16 code
  //   units (irregexp operates only on code units).
  // - Lone surrogates: these require lookarounds to ensure we don't match in
  //   the middle of a surrogate pair.
  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
  UnicodeRangeSplitter splitter(ranges);
  AddBmpCharacters(compiler, result, on_success, &splitter);
  AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
  AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
  AddLoneTrailSurrogates(compiler, result, on_success, &splitter);

  static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
  if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();

  return result;
}

// Converts a class set operand (character ranges plus optional strings) into
// a matcher node by building an equivalent tree (a single alternative, or a
// disjunction of strings, ranges and the empty string) and delegating to its
// ToNode.
RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
                                          RegExpNode* on_success) {
  Zone* zone = compiler->zone();
  // Number of alternatives: one per string, plus one for all ranges together.
  const int size = (has_strings() ? static_cast<int>(strings()->size()) : 0) +
                   (ranges()->is_empty() ? 0 : 1);
  if (size == 0) {
    // If neither ranges nor strings are present, the operand is equal to an
    // empty range (matching nothing).
    ZoneList<CharacterRange>* empty =
        zone->template New<ZoneList<CharacterRange>>(0, zone);
    return zone->template New<RegExpClassRanges>(zone, empty)
        ->ToNode(compiler, on_success);
  }
  ZoneList<RegExpTree*>* alternatives =
      zone->template New<ZoneList<RegExpTree*>>(size, zone);
  // Strings are sorted by length first (larger strings before shorter ones).
  // See the comment on CharacterClassStrings.
  // Empty strings (if present) are added after character ranges.
  RegExpTree* empty_string = nullptr;
  if (has_strings()) {
    for (auto string : *strings()) {
      if (string.second->IsEmpty()) {
        empty_string = string.second;
      } else {
        alternatives->Add(string.second, zone);
      }
    }
  }
  if (!ranges()->is_empty()) {
    alternatives->Add(zone->template New<RegExpClassRanges>(zone, ranges()),
                      zone);
  }
  if (empty_string != nullptr) {
    alternatives->Add(empty_string, zone);
  }

  RegExpTree* node = nullptr;
  if (size == 1) {
    DCHECK_EQ(alternatives->length(), 1);
    node = alternatives->first();
  } else {
    node = zone->template New<RegExpDisjunction>(alternatives);
  }
  return node->ToNode(compiler, on_success);
}

// Evaluates the class set expression to a single operand and converts that
// operand to a node.
RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
                                             RegExpNode* on_success) {
  Zone* zone = compiler->zone();
  ZoneList<CharacterRange>* temp_ranges =
      zone->template New<ZoneList<CharacterRange>>(4, zone);
  RegExpClassSetOperand* root = ComputeExpression(this, temp_ranges, zone);
  return root->ToNode(compiler, on_success);
}

// Adds the ranges and strings of |other| to this operand, in place. The
// ranges are appended without being canonicalized here.
void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
  ranges()->AddAll(*other->ranges(), zone);
  if (other->has_strings()) {
    if (strings_ == nullptr) {
      strings_ = zone->template New<CharacterClassStrings>(zone);
    }
    strings()->insert(other->strings()->begin(), other->strings()->end());
  }
}

// Reduces this operand, in place, to what it has in common with |other|.
// |temp_ranges| is scratch space; it is left empty on return.
void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
                                      ZoneList<CharacterRange>* temp_ranges,
                                      Zone* zone) {
  CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
  std::swap(*ranges(), *temp_ranges);
  temp_ranges->Rewind(0);
  if (has_strings()) {
    if (!other->has_strings()) {
      strings()->clear();
    } else {
      // Keep only the strings that |other| contains as well.
      for (auto iter = strings()->begin(); iter != strings()->end();) {
        if (other->strings()->find(iter->first) == other->strings()->end()) {
          iter = strings()->erase(iter);
        } else {
          iter++;
        }
      }
    }
  }
}

// Removes the ranges and strings of |other| from this operand, in place.
// |temp_ranges| is scratch space; it is left empty on return.
void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
                                     ZoneList<CharacterRange>* temp_ranges,
                                     Zone* zone) {
  CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
  std::swap(*ranges(), *temp_ranges);
  temp_ranges->Rewind(0);
  if (has_strings() && other->has_strings()) {
    // Drop every string that |other| contains.
    for (auto iter = strings()->begin(); iter != strings()->end();) {
      if (other->strings()->find(iter->first) != other->strings()->end()) {
        iter = strings()->erase(iter);
      } else {
        iter++;
      }
    }
  }
}

// static
// Recursively evaluates the class set tree rooted at |root| and returns the
// resulting operand. An operand is returned as is. For an expression, the
// remaining operands are folded into the result of the first operand (which is
// modified in place), the expression's negation is applied, and the
// expression's operand list is collapsed to that single result.
// |temp_ranges| is scratch space and must be empty on entry.
RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(
    RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
  DCHECK(temp_ranges->is_empty());
  if (root->IsClassSetOperand()) {
    return root->AsClassSetOperand();
  }
  DCHECK(root->IsClassSetExpression());
  RegExpClassSetExpression* node = root->AsClassSetExpression();
  RegExpClassSetOperand* result =
      ComputeExpression(node->operands()->at(0), temp_ranges, zone);
  switch (node->operation()) {
    case OperationType::kUnion: {
      for (int i = 1; i < node->operands()->length(); i++) {
        RegExpClassSetOperand* op =
            ComputeExpression(node->operands()->at(i), temp_ranges, zone);
        result->Union(op, zone);
      }
      // Union only appends ranges, so canonicalize once at the end.
      CharacterRange::Canonicalize(result->ranges());
      break;
    }
    case OperationType::kIntersection: {
      for (int i = 1; i < node->operands()->length(); i++) {
        RegExpClassSetOperand* op =
            ComputeExpression(node->operands()->at(i), temp_ranges, zone);
        result->Intersect(op, temp_ranges, zone);
      }
      break;
    }
    case OperationType::kSubtraction: {
      for (int i = 1; i < node->operands()->length(); i++) {
        RegExpClassSetOperand* op =
            ComputeExpression(node->operands()->at(i), temp_ranges, zone);
        result->Subtract(op, temp_ranges, zone);
      }
      break;
    }
  }
  if (node->is_negated()) {
    DCHECK(!result->has_strings());
    CharacterRange::Negate(result->ranges(), temp_ranges, zone);
    std::swap(*result->ranges(), *temp_ranges);
    temp_ranges->Rewind(0);
  }
  // Store the result as single operand of the current node.
  node->operands()->Set(0, result);
  node->operands()->Rewind(1);

  return result;
}

namespace {

// Three-way comparison of the first code units of two atoms; used as the
// sort predicate for case-sensitive regexps.
int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* atom1 = (*a)->AsAtom();
  RegExpAtom* atom2 = (*b)->AsAtom();
  base::uc16 character1 = atom1->data().at(0);
  base::uc16 character2 = atom2->data().at(0);
  if (character1 < character2) return -1;
  if (character1 > character2) return 1;
  return 0;
}

#ifdef V8_INTL_SUPPORT

// Compares |a| and |b| using ICU's default case folding.
int CompareCaseInsensitive(const icu::UnicodeString& a,
                           const icu::UnicodeString& b) {
  return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
}

// Case-insensitive comparison of the first code units of two atoms.
int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
                                    RegExpTree* const* b) {
  RegExpAtom* atom1 = (*a)->AsAtom();
  RegExpAtom* atom2 = (*b)->AsAtom();
  return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
                                icu::UnicodeString{atom2->data().at(0)});
}

// Returns true if |a| and |b| are identical or, when |ignore_case| is set,
// compare equal case-insensitively.
bool Equals(bool ignore_case, const icu::UnicodeString& a,
            const icu::UnicodeString& b) {
  if (a == b) return true;
  if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
  return false;  // Case-sensitive equality already checked above.
}

// Compares the code units at |index| of the two atoms using Equals.
bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
                  const RegExpAtom* b) {
  return Equals(ignore_case, a->data().at(index), b->data().at(index));
}

#else

// Returns the canonical form of |c| according to |canonicalize|, or |c|
// itself when the mapping yields no replacement.
unibrow::uchar Canonical(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar c) {
  unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
  int length = canonicalize->get(c, '\0', chars);
  DCHECK_LE(length, 1);
  unibrow::uchar canonical = c;
  if (length == 1) canonical = chars[0];
  return canonical;
}

// Three-way comparison of |a| and |b|. Both are canonicalized first, unless
// both are below 'a', in which case they are compared as they are.
int CompareCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar a, unibrow::uchar b) {
  if (a == b) return 0;
  if (a >= 'a' || b >= 'a') {
    a = Canonical(canonicalize, a);
    b = Canonical(canonicalize, b);
  }
  return static_cast<int>(a) - static_cast<int>(b);
}

// Case-insensitive comparison of the first code units of two atoms.
int CompareFirstCharCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* atom1 = (*a)->AsAtom();
  RegExpAtom* atom2 = (*b)->AsAtom();
  return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
                                atom2->data().at(0));
}

// Returns true if |a| and |b| are identical or, when |ignore_case| is set,
// compare equal case-insensitively.
bool Equals(bool ignore_case,
            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
            unibrow::uchar a, unibrow::uchar b) {
  if (a == b) return true;
  if (ignore_case) {
    return CompareCaseInsensitive(canonicalize, a, b) == 0;
  }
  return false;  // Case-sensitive equality already checked above.
}

// Compares the code units at |index| of the two atoms using Equals.
bool CharAtEquals(bool ignore_case,
                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
                  int index, const RegExpAtom* a, const RegExpAtom* b) {
  return Equals(ignore_case, canonicalize, a->data().at(index),
                b->data().at(index));
}

#endif  // V8_INTL_SUPPORT

}  // namespace

// We can stable sort runs of atoms, since the order does not matter if they
// start with different characters.
// Returns true if any consecutive atoms were found.
+bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + int length = alternatives->length(); + bool found_consecutive_atoms = false; + for (int i = 0; i < length; i++) { + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (alternative->IsAtom()) break; + i++; + } + // i is length or it is the index of an atom. + if (i == length) break; + int first_atom = i; + i++; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + i++; + } + // Sort atoms to get ones with common prefixes together. + // This step is more tricky if we are in a case-independent regexp, + // because it would change /is|I/ to /I|is/, and order matters when + // the regexp parts don't match only disjoint starting points. To fix + // this we have a version of CompareFirstChar that uses case- + // independent character classes for comparison. + DCHECK_LT(first_atom, alternatives->length()); + DCHECK_LE(i, alternatives->length()); + DCHECK_LE(first_atom, i); + if (IsIgnoreCase(compiler->flags())) { +#ifdef V8_INTL_SUPPORT + alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom, + i - first_atom); +#else + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); + auto compare_closure = [canonicalize](RegExpTree* const* a, + RegExpTree* const* b) { + return CompareFirstCharCaseInsensitive(canonicalize, a, b); + }; + alternatives->StableSort(compare_closure, first_atom, i - first_atom); +#endif // V8_INTL_SUPPORT + } else { + alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); + } + if (i - first_atom > 1) found_consecutive_atoms = true; + } + return found_consecutive_atoms; +} + +// Optimizes ab|ac|az to a(?:b|c|d). 
+void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { + Zone* zone = compiler->zone(); + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + int length = alternatives->length(); + const bool ignore_case = IsIgnoreCase(compiler->flags()); + + int write_posn = 0; + int i = 0; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + RegExpAtom* const atom = alternative->AsAtom(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeString common_prefix(atom->data().at(0)); +#else + unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); + unibrow::uchar common_prefix = atom->data().at(0); + if (ignore_case) { + common_prefix = Canonical(canonicalize, common_prefix); + } +#endif // V8_INTL_SUPPORT + int first_with_prefix = i; + int prefix_length = atom->length(); + i++; + while (i < length) { + alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + RegExpAtom* const alt_atom = alternative->AsAtom(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeString new_prefix(alt_atom->data().at(0)); + if (!Equals(ignore_case, new_prefix, common_prefix)) break; +#else + unibrow::uchar new_prefix = alt_atom->data().at(0); + if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break; +#endif // V8_INTL_SUPPORT + prefix_length = std::min(prefix_length, alt_atom->length()); + i++; + } + if (i > first_with_prefix + 2) { + // Found worthwhile run of alternatives with common prefix of at least one + // character. The sorting function above did not sort on more than one + // character for reasons of correctness, but there may still be a longer + // common prefix if the terms were similar or presorted in the input. + // Find out how long the common prefix is. 
+ int run_length = i - first_with_prefix; + RegExpAtom* const alt_atom = + alternatives->at(first_with_prefix)->AsAtom(); + for (int j = 1; j < run_length && prefix_length > 1; j++) { + RegExpAtom* old_atom = + alternatives->at(j + first_with_prefix)->AsAtom(); + for (int k = 1; k < prefix_length; k++) { +#ifdef V8_INTL_SUPPORT + if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) { +#else + if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) { +#endif // V8_INTL_SUPPORT + prefix_length = k; + break; + } + } + } + RegExpAtom* prefix = + zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length)); + ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone); + pair->Add(prefix, zone); + ZoneList<RegExpTree*>* suffixes = + zone->New<ZoneList<RegExpTree*>>(run_length, zone); + for (int j = 0; j < run_length; j++) { + RegExpAtom* old_atom = + alternatives->at(j + first_with_prefix)->AsAtom(); + int len = old_atom->length(); + if (len == prefix_length) { + suffixes->Add(zone->New<RegExpEmpty>(), zone); + } else { + RegExpTree* suffix = zone->New<RegExpAtom>( + old_atom->data().SubVector(prefix_length, old_atom->length())); + suffixes->Add(suffix, zone); + } + } + pair->Add(zone->New<RegExpDisjunction>(suffixes), zone); + alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair); + } else { + // Just copy any non-worthwhile alternatives. + for (int j = first_with_prefix; j < i; j++) { + alternatives->at(write_posn++) = alternatives->at(j); + } + } + } + alternatives->Rewind(write_posn); // Trim end of array. +} + +// Optimizes b|c|z to [bcz]. 
+void RegExpDisjunction::FixSingleCharacterDisjunctions( + RegExpCompiler* compiler) { + Zone* zone = compiler->zone(); + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + int length = alternatives->length(); + + int write_posn = 0; + int i = 0; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + RegExpAtom* const atom = alternative->AsAtom(); + if (atom->length() != 1) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + const RegExpFlags flags = compiler->flags(); + DCHECK_IMPLIES(IsEitherUnicode(flags), + !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); + bool contains_trail_surrogate = + unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); + int first_in_run = i; + i++; + // Find a run of single-character atom alternatives that have identical + // flags (case independence and unicode-ness). + while (i < length) { + alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + RegExpAtom* const alt_atom = alternative->AsAtom(); + if (alt_atom->length() != 1) break; + DCHECK_IMPLIES(IsEitherUnicode(flags), + !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0))); + contains_trail_surrogate |= + unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0)); + i++; + } + if (i > first_in_run + 1) { + // Found non-trivial run of single-character alternatives. 
+ int run_length = i - first_in_run; + ZoneList<CharacterRange>* ranges = + zone->New<ZoneList<CharacterRange>>(2, zone); + for (int j = 0; j < run_length; j++) { + RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); + DCHECK_EQ(old_atom->length(), 1); + ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); + } + RegExpClassRanges::ClassRangesFlags class_ranges_flags; + if (IsEitherUnicode(flags) && contains_trail_surrogate) { + class_ranges_flags = RegExpClassRanges::CONTAINS_SPLIT_SURROGATE; + } + alternatives->at(write_posn++) = + zone->New<RegExpClassRanges>(zone, ranges, class_ranges_flags); + } else { + // A run of one: copy the lone alternative through unchanged. + for (int j = first_in_run; j < i; j++) { + alternatives->at(write_posn++) = alternatives->at(j); + } + } + } + alternatives->Rewind(write_posn); // Trim end of array. +} + +// Builds a ChoiceNode with one alternative per branch. With more than two +// alternatives the atom optimizations run first; if they leave only a single +// alternative, that alternative's node is returned directly. +RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + compiler->ToNodeMaybeCheckForStackOverflow(); + + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + + if (alternatives->length() > 2) { + bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); + if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); + FixSingleCharacterDisjunctions(compiler); + if (alternatives->length() == 1) { + return alternatives->at(0)->ToNode(compiler, on_success); + } + } + + int length = alternatives->length(); + + ChoiceNode* result = + compiler->zone()->New<ChoiceNode>(length, compiler->zone()); + for (int i = 0; i < length; i++) { + GuardedAlternative alternative( + alternatives->at(i)->ToNode(compiler, on_success)); + result->AddAlternative(alternative); + } + return result; +} + +// Forwards to the ToNode overload that takes explicit bounds, using this +// quantifier's own min, max, greediness and body. +RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return ToNode(min(), max(), is_greedy(), body(), compiler, on_success); +} + +namespace { +// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and +// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) +// Must only be called when NeedsUnicodeCaseEquivalents(flags) holds; this +// is CHECKed on entry. +RegExpNode* 
BoundaryAssertionAsLookaround(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpAssertion::Type type, + RegExpFlags flags) { + CHECK(NeedsUnicodeCaseEquivalents(flags)); + Zone* zone = compiler->zone(); + ZoneList<CharacterRange>* word_range = + zone->New<ZoneList<CharacterRange>>(2, zone); + CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true, + zone); + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + ChoiceNode* result = zone->New<ChoiceNode>(2, zone); + // Add two choices: the character before the (non-)boundary is either a + // word character (i == 0) or a non-word character (i == 1). + for (int i = 0; i < 2; i++) { + bool lookbehind_for_word = i == 0; + // For a boundary the two sides must differ; for a non-boundary they + // must agree. + bool lookahead_for_word = + (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word; + // Look to the left: a positive lookbehind for a word character if + // lookbehind_for_word, a negative one otherwise. + RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, + stack_register, position_register); + RegExpNode* backward = TextNode::CreateForCharacterRanges( + zone, word_range, true, lookbehind.on_match_success()); + // Look to the right; on success the lookahead continues into the + // lookbehind built above. + RegExpLookaround::Builder lookahead(lookahead_for_word, + lookbehind.ForMatch(backward), + stack_register, position_register); + RegExpNode* forward = TextNode::CreateForCharacterRanges( + zone, word_range, false, lookahead.on_match_success()); + result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); + } + return result; +} +} // anonymous namespace + +// Lowers this assertion to a node. \b and \B are desugared to lookarounds +// when the compiler's flags need Unicode case equivalents. +RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + NodeInfo info; // NOTE(review): not referenced again in this function. + Zone* zone = compiler->zone(); + + switch (assertion_type()) { + case Type::START_OF_LINE: + return AssertionNode::AfterNewline(on_success); + case Type::START_OF_INPUT: + return AssertionNode::AtStart(on_success); + case Type::BOUNDARY: + return NeedsUnicodeCaseEquivalents(compiler->flags()) + ? 
BoundaryAssertionAsLookaround( + compiler, on_success, Type::BOUNDARY, compiler->flags()) + : AssertionNode::AtBoundary(on_success); + case Type::NON_BOUNDARY: + return NeedsUnicodeCaseEquivalents(compiler->flags()) + ? BoundaryAssertionAsLookaround(compiler, on_success, + Type::NON_BOUNDARY, + compiler->flags()) + : AssertionNode::AtNonBoundary(on_success); + case Type::END_OF_INPUT: + return AssertionNode::AtEnd(on_success); + case Type::END_OF_LINE: { + // Compile $ in multiline regexps as an alternation with a positive + // lookahead for a line terminator on one side and an end-of-input + // check on the other side. + // We need two registers for the lookahead. + int stack_pointer_register = compiler->AllocateRegister(); + int position_register = compiler->AllocateRegister(); + // The ChoiceNode to distinguish between a newline and end-of-input. + ChoiceNode* result = zone->New<ChoiceNode>(2, zone); + // Create a newline atom. + // NOTE(review): newline_ranges is filled in below but not referenced + // again in this case; newline_atom is built from the standard set + // directly. + ZoneList<CharacterRange>* newline_ranges = + zone->New<ZoneList<CharacterRange>>(3, zone); + CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator, + newline_ranges, false, zone); + RegExpClassRanges* newline_atom = + zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator); + TextNode* newline_matcher = + zone->New<TextNode>(newline_atom, false, + ActionNode::PositiveSubmatchSuccess( + stack_pointer_register, position_register, + 0, // No captures inside. + -1, // Ignored if no captures. + on_success)); + // Wrap the newline matcher in a positive submatch, i.e. a lookahead for + // a line terminator. + RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch( + stack_pointer_register, position_register, newline_matcher); + // Add the two alternatives to the ChoiceNode. 
+ GuardedAlternative eol_alternative(end_of_line); + result->AddAlternative(eol_alternative); + GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); + result->AddAlternative(end_alternative); + return result; + } + default: + UNREACHABLE(); + } +} +
+// Emits a BackReferenceNode over the start and end registers of the referenced
+// capture, using the compiler's current read direction.
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return compiler->zone()->New<BackReferenceNode>( + RegExpCapture::StartRegister(index()), + RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(), + on_success); +} +
+// Emits no node of its own: matching continues directly at on_success.
+RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return on_success; +} +
+// A group adds no node of its own; it delegates to its body.
+RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return body_->ToNode(compiler, on_success); +} +
+// Prepares on_match_success_, the continuation the lookaround body has to
+// reach: a PositiveSubmatchSuccess action leading to on_success for positive
+// lookarounds, a NegativeSubmatchSuccess node otherwise.
+RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, + int stack_pointer_register, + int position_register, + int capture_register_count, + int capture_register_start) + : is_positive_(is_positive), + on_success_(on_success), + stack_pointer_register_(stack_pointer_register), + position_register_(position_register) { + if (is_positive_) { + on_match_success_ = ActionNode::PositiveSubmatchSuccess( + stack_pointer_register, position_register, capture_register_count, + capture_register_start, on_success_); + } else { + Zone* zone = on_success_->zone(); + on_match_success_ = zone->New<NegativeSubmatchSuccess>( + stack_pointer_register, position_register, capture_register_count, + capture_register_start, zone); + } +} +
+// Wraps the compiled lookaround body |match| in the Begin*Submatch action
+// that corresponds to the polarity chosen at construction.
+RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { + if (is_positive_) { + return ActionNode::BeginPositiveSubmatch(stack_pointer_register_, + position_register_, match); + } else { + Zone* zone = on_success_->zone(); + // We use a ChoiceNode to represent the negative lookaround. The first + // alternative is the negative match. On success, the end node backtracks. 
+ // On failure, the second alternative is tried and leads to success. + // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the + // first exit when calculating quick checks. + ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>( + GuardedAlternative(match), GuardedAlternative(on_success_), zone); + return ActionNode::BeginNegativeSubmatch(stack_pointer_register_, + position_register_, choice_node); + } +} + +// Allocates the two submatch registers, compiles the body with the read +// direction set by the lookaround type (backward for LOOKBEHIND) and then +// restores the previous read direction. The capture registers handed to the +// builder are two per capture, starting at register 2 + 2 * capture_from_. +RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + int stack_pointer_register = compiler->AllocateRegister(); + int position_register = compiler->AllocateRegister(); + + const int registers_per_capture = 2; + const int register_of_first_capture = 2; + int register_count = capture_count_ * registers_per_capture; + int register_start = + register_of_first_capture + capture_from_ * registers_per_capture; + + RegExpNode* result; + bool was_reading_backward = compiler->read_backward(); + compiler->set_read_backward(type() == LOOKBEHIND); + Builder builder(is_positive(), on_success, stack_pointer_register, + position_register, register_count, register_start); + RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); + result = builder.ForMatch(match); + compiler->set_read_backward(was_reading_backward); + return result; +} + +// Forwards to the overload below with this capture's own body and index. +RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return ToNode(body(), index(), compiler, on_success); +} + +// Brackets the body with two StorePosition actions, one for the capture's +// start register and one for its end register. The two registers are swapped +// when the compiler is reading backward. +RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index, + RegExpCompiler* compiler, + RegExpNode* on_success) { + DCHECK_NOT_NULL(body); + int start_reg = RegExpCapture::StartRegister(index); + int end_reg = RegExpCapture::EndRegister(index); + if (compiler->read_backward()) std::swap(start_reg, end_reg); + RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); + RegExpNode* body_node = body->ToNode(compiler, store_end); + return ActionNode::StorePosition(start_reg, true, 
body_node); +} + +namespace { + +class AssertionSequenceRewriter final { + public: + // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass + // instead of sprinkling rewrites into the AST->Node conversion process. + // + // Scans |terms| for runs of two or more consecutive assertions and + // rewrites each such run in place. + static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) { + AssertionSequenceRewriter rewriter(terms, zone); + + static constexpr int kNoIndex = -1; + int from = kNoIndex; + + for (int i = 0; i < terms->length(); i++) { + RegExpTree* t = terms->at(i); + if (from == kNoIndex && t->IsAssertion()) { + from = i; // Start a sequence. + } else if (from != kNoIndex && !t->IsAssertion()) { + // Terminate the sequence; only runs of at least two are rewritten. + if (i - from > 1) rewriter.Rewrite(from, i); + from = kNoIndex; + } + } + + // Handle a sequence that runs up to the end of the term list. + if (from != kNoIndex && terms->length() - from > 1) { + rewriter.Rewrite(from, terms->length()); + } + } + + // All assertions are zero width. A consecutive sequence of assertions is + // order-independent. There are two ways we can optimize here: + // 1. fold all identical assertions. + // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire + // sequence fails. + // [from, to) is the half-open index range of the sequence within terms_. + void Rewrite(int from, int to) { + DCHECK_GT(to, from + 1); + + // Bitfield of all seen assertions. + uint32_t seen_assertions = 0; + static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) < + kUInt32Size * kBitsPerByte); + + for (int i = from; i < to; i++) { + RegExpAssertion* t = terms_->at(i)->AsAssertion(); + const uint32_t bit = 1 << static_cast<int>(t->assertion_type()); + + if (seen_assertions & bit) { + // Fold duplicates: a repeated assertion is replaced by an empty term. + terms_->Set(i, zone_->New<RegExpEmpty>()); + } + + seen_assertions |= bit; + } + + // Collapse failures: a sequence containing both \b and \B always fails. 
+ const uint32_t always_fails_mask = + 1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) | + 1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY); + if ((seen_assertions & always_fails_mask) == always_fails_mask) { + ReplaceSequenceWithFailure(from, to); + } + } + + void ReplaceSequenceWithFailure(int from, int to) { + // Replace the entire sequence with a single node that always fails. + // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the + // negated '*' (everything) range serves the purpose. + // NOTE(review): the code below builds a class from an empty range list + // and passes no flags, rather than a negated everything-range; + // presumably that empty class matches nothing. Confirm and update the + // wording above. + ZoneList<CharacterRange>* ranges = + zone_->New<ZoneList<CharacterRange>>(0, zone_); + RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges); + terms_->Set(from, cc); + + // Replace the rest of the sequence with one shared empty term. + RegExpEmpty* empty = zone_->New<RegExpEmpty>(); + for (int i = from + 1; i < to; i++) terms_->Set(i, empty); + } + + private: + AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone) + : zone_(zone), terms_(terms) {} + + Zone* zone_; + ZoneList<RegExpTree*>* terms_; +}; + +} // namespace + +// Chains the children into a sequence that ends at on_success. Each child is +// compiled with the node built so far as its continuation, so the children +// are visited last-to-first when reading forward and first-to-last when +// reading backward. +RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + compiler->ToNodeMaybeCheckForStackOverflow(); + + ZoneList<RegExpTree*>* children = nodes(); + + AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); + + RegExpNode* current = on_success; + if (compiler->read_backward()) { + for (int i = 0; i < children->length(); i++) { + current = children->at(i)->ToNode(compiler, current); + } + } else { + for (int i = children->length() - 1; i >= 0; i--) { + current = children->at(i)->ToNode(compiler, current); + } + } + return current; +} + +namespace { + +// Appends the ranges encoded in |elmv| to |ranges|. |elmv| holds |elmc| +// entries: pairs of (first, one-past-last) values followed by a final +// kRangeEndMarker. +void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges, + Zone* zone) { + elmc--; + DCHECK_EQ(kRangeEndMarker, elmv[elmc]); + for (int i = 0; i < elmc; i += 2) { + DCHECK(elmv[i] < elmv[i + 1]); + ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); + } +} + +// Appends the complement of the ranges encoded in |elmv| (same layout as for +// AddClass) to |ranges|, up to and including kMaxCodePoint. +void AddClassNegated(const int* elmv, int 
elmc, + ZoneList<CharacterRange>* ranges, Zone* zone) { + elmc--; + DCHECK_EQ(kRangeEndMarker, elmv[elmc]); + DCHECK_NE(0x0000, elmv[0]); + DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]); + base::uc16 last = 0x0000; + for (int i = 0; i < elmc; i += 2) { + DCHECK(last <= elmv[i] - 1); + DCHECK(elmv[i] < elmv[i + 1]); + ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); + last = elmv[i + 1]; + } + ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone); +} + +} // namespace + +// Appends the ranges of |standard_character_set| to |ranges|. When +// |add_unicode_case_equivalents| is set, the word classes (kWord and +// kNotWord) are computed from the case-closed word set; every other +// combination is added straight from the range tables. +void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set, + ZoneList<CharacterRange>* ranges, + bool add_unicode_case_equivalents, + Zone* zone) { + if (add_unicode_case_equivalents && + (standard_character_set == StandardCharacterSet::kWord || + standard_character_set == StandardCharacterSet::kNotWord)) { + // See #sec-runtime-semantics-wordcharacters-abstract-operation. + // In case of unicode and ignore_case, we need to create the closure over + // case equivalent characters before negating. 
+ ZoneList<CharacterRange>* new_ranges = + zone->New<ZoneList<CharacterRange>>(2, zone); + AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); + AddUnicodeCaseEquivalents(new_ranges, zone); + if (standard_character_set == StandardCharacterSet::kNotWord) { + ZoneList<CharacterRange>* negated = + zone->New<ZoneList<CharacterRange>>(2, zone); + CharacterRange::Negate(new_ranges, negated, zone); + new_ranges = negated; + } + ranges->AddAll(*new_ranges, zone); + return; + } + + switch (standard_character_set) { + case StandardCharacterSet::kWhitespace: + AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); + break; + case StandardCharacterSet::kNotWhitespace: + AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); + break; + case StandardCharacterSet::kWord: + AddClass(kWordRanges, kWordRangeCount, ranges, zone); + break; + case StandardCharacterSet::kNotWord: + AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); + break; + case StandardCharacterSet::kDigit: + AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); + break; + case StandardCharacterSet::kNotDigit: + AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); + break; + // This is the set of line terminators that the ^ and $ assertions look + // for in multiline mode. + case StandardCharacterSet::kLineTerminator: + AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); + break; + case StandardCharacterSet::kNotLineTerminator: + AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, + zone); + break; + // This is not a character range as defined by the spec but a + // convenient shorthand for a character class that matches any + // character. 
+ case StandardCharacterSet::kEverything: + ranges->Add(CharacterRange::Everything(), zone); + break; + } +} + +// static +// Canonicalizes |ranges| and then appends the case-equivalent characters of +// its ranges. Only the part of each range up to kMaxUtf16CodeUnit is +// considered, and ranges lying entirely within the surrogate area are +// skipped. +void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, + ZoneList<CharacterRange>* ranges, + bool is_one_byte) { + CharacterRange::Canonicalize(ranges); + int range_count = ranges->length(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeSet others; + for (int i = 0; i < range_count; i++) { + CharacterRange range = ranges->at(i); + base::uc32 from = range.from(); + if (from > kMaxUtf16CodeUnit) continue; + base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU}); + // Nothing to be done for surrogates. + if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; + // One-byte subject and no Latin1 equivalents reported for this range: + // clip it to the one-byte code units. + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { + if (from > kMaxOneByteCharCode) continue; + if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode; + } + others.add(from, to); + } + + // Compute the set of additional characters that should be added, + // using UnicodeSet::closeOver. ECMA 262 defines slightly different + // case-folding rules than Unicode, so some characters that are + // added by closeOver do not match anything other than themselves in + // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the + // same case-insensitive character as 's' or 'S' according to + // Unicode, but does not match any other character in JS. To handle + // this case, we add such characters to the IgnoreSet and filter + // them out. We filter twice: once before calling closeOver (to + // prevent 'ſ' from adding 's'), and once after calling closeOver + // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for + // more information. 
+ icu::UnicodeSet already_added(others); + others.removeAll(RegExpCaseFolding::IgnoreSet()); + others.closeOver(USET_CASE_INSENSITIVE); + others.removeAll(RegExpCaseFolding::IgnoreSet()); + others.removeAll(already_added); + + // Add the remaining characters, i.e. those that were not part of the + // input set, to the ranges. + for (int32_t i = 0; i < others.getRangeCount(); i++) { + UChar32 from = others.getRangeStart(i); + UChar32 to = others.getRangeEnd(i); + if (from == to) { + ranges->Add(CharacterRange::Singleton(from), zone); + } else { + ranges->Add(CharacterRange::Range(from, to), zone); + } + } +#else + // Without V8_INTL_SUPPORT the equivalents are looked up through the + // isolate's jsregexp_uncanonicalize() / jsregexp_canonrange() tables. + for (int i = 0; i < range_count; i++) { + CharacterRange range = ranges->at(i); + base::uc32 bottom = range.from(); + if (bottom > kMaxUtf16CodeUnit) continue; + base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU}); + // Nothing to be done for surrogates. + if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { + if (bottom > kMaxOneByteCharCode) continue; + if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode; + } + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + if (top == bottom) { + // If this is a singleton we just expand the one character. + int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); + // NOTE(review): this inner `i` shadows the index of the enclosing loop. + for (int i = 0; i < length; i++) { + base::uc32 chr = chars[i]; + if (chr != bottom) { + ranges->Add(CharacterRange::Singleton(chars[i]), zone); + } + } + } else { + // If this is a range we expand the characters block by block, expanding + // contiguous subranges (blocks) one at a time. The approach is as + // follows. For a given start character we look up the remainder of the + // block that contains it (represented by the end point), for instance we + // find 'z' if the character is 'c'. A block is characterized by the + // property that all characters uncanonicalize in the same way, except + // that each entry in the result is incremented by the distance from the + // first element. 
So a-z is a block because 'a' uncanonicalizes to ['a', + // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once + // we've found the end point we look up its uncanonicalization and + // produce a range for each element. For instance for [c-f] we look up + // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if + // it is not already contained in the input, so [c-f] will be skipped but + // [C-F] will be added. If this range is not completely contained in a + // block we do this for all the blocks covered by the range (handling + // characters that are not in a block as a "singleton block"). + unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + base::uc32 pos = bottom; + while (pos <= top) { + int length = + isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); + base::uc32 block_end; + if (length == 0) { + // No block was found for pos: treat it as a singleton block. + block_end = pos; + } else { + DCHECK_EQ(1, length); + block_end = equivalents[0]; + } + // Clip the block to the end of the input range. + int end = (block_end > top) ? top : block_end; + length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', + equivalents); + for (int i = 0; i < length; i++) { + base::uc32 c = equivalents[i]; + base::uc32 range_from = c - (block_end - pos); + base::uc32 range_to = c - (block_end - end); + if (!(bottom <= range_from && range_to <= top)) { + ranges->Add(CharacterRange::Range(range_from, range_to), zone); + } + } + pos = end + 1; + } + } + } +#endif // V8_INTL_SUPPORT +} + +// Returns true if |ranges| is in canonical form: each range starts more than +// one past the end of its predecessor, so the list is increasing with no +// overlapping or adjacent ranges. Lists of length 0 or 1 are canonical. +bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) { + DCHECK_NOT_NULL(ranges); + int n = ranges->length(); + if (n <= 1) return true; + base::uc32 max = ranges->at(0).to(); + for (int i = 1; i < n; i++) { + CharacterRange next_range = ranges->at(i); + if (next_range.from() <= max + 1) return false; + max = next_range.to(); + } + return true; +} + +// Returns the range list, creating it from the standard set type on first +// use when no explicit list has been stored. +ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { + if (ranges_ == nullptr) { + ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone); + 
CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false, + zone); + } + return ranges_; +} + +namespace { + +// Move a number of elements in a zonelist to another position +// in the same list. Handles overlapping source and target areas. +void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) { + // Source and target may overlap: copy back-to-front when moving towards + // higher indices and front-to-back otherwise, so that no element is + // overwritten before it has been read. + if (from < to) { + for (int i = count - 1; i >= 0; i--) { + list->at(to + i) = list->at(from + i); + } + } else { + for (int i = 0; i < count; i++) { + list->at(to + i) = list->at(from + i); + } + } +} + +int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count, + CharacterRange insert) { + // Inserts a range into list[0..count[, which must be sorted + // by from value and non-overlapping and non-adjacent, using at most + // list[0..count] for the result. Returns the number of resulting + // canonicalized ranges. Inserting a range may collapse existing ranges into + // fewer ranges, so the return value can be anything in the range 1..count+1. + base::uc32 from = insert.from(); + base::uc32 to = insert.to(); + int start_pos = 0; + int end_pos = count; + // Scan from the back: end_pos becomes the first range that lies strictly + // after the inserted range (not even adjacent), start_pos the position + // just past the last range that lies strictly before it. + for (int i = count - 1; i >= 0; i--) { + CharacterRange current = list->at(i); + if (current.from() > to + 1) { + end_pos = i; + } else if (current.to() + 1 < from) { + start_pos = i + 1; + break; + } + } + + // Inserted range overlaps, or is adjacent to, ranges at positions + // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are + // not affected by the insertion. + // If start_pos == end_pos, the range must be inserted before start_pos. + // If start_pos < end_pos, the entire range from start_pos to end_pos + // must be merged with the insert range. + + if (start_pos == end_pos) { + // Insert between existing ranges at position start_pos. 
+ if (start_pos < count) { + MoveRanges(list, start_pos, start_pos + 1, count - start_pos); + } + list->at(start_pos) = insert; + return count + 1; + } + if (start_pos + 1 == end_pos) { + // Replace the single existing range at position start_pos with its + // union with the inserted range. + CharacterRange to_replace = list->at(start_pos); + int new_from = std::min(to_replace.from(), from); + int new_to = std::max(to_replace.to(), to); + list->at(start_pos) = CharacterRange::Range(new_from, new_to); + return count; + } + // Replace a number of existing ranges from start_pos to end_pos - 1 with + // one merged range. Move the remaining ranges down. + + int new_from = std::min(list->at(start_pos).from(), from); + int new_to = std::max(list->at(end_pos - 1).to(), to); + if (end_pos < count) { + MoveRanges(list, end_pos, start_pos + 1, count - end_pos); + } + list->at(start_pos) = CharacterRange::Range(new_from, new_to); + return count - (end_pos - start_pos) + 1; +} + +} // namespace + +// Canonicalizes the explicit range list, if one has been stored. +void CharacterSet::Canonicalize() { + // Special/default classes are always considered canonical. The result + // of calling ranges() will be sorted. + if (ranges_ == nullptr) return; + CharacterRange::Canonicalize(ranges_); +} + +// static +// Brings |character_ranges| into canonical form in place: sorted by start, +// with overlapping and adjacent ranges merged. The list is shortened to the +// number of resulting ranges. +void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { + if (character_ranges->length() <= 1) return; + // Check whether ranges are already canonical (increasing, non-overlapping, + // non-adjacent). + int n = character_ranges->length(); + base::uc32 max = character_ranges->at(0).to(); + int i = 1; + while (i < n) { + CharacterRange current = character_ranges->at(i); + if (current.from() <= max + 1) { + break; + } + max = current.to(); + i++; + } + // Canonical until the i'th range. If that's all of them, we are done. + if (i == n) return; + + // The ranges at index i and forward are not canonicalized. Make them so by + // doing the equivalent of insertion sort (inserting each into the previous + // list, in order). 
+ // Notice that inserting a range can reduce the number of ranges in the + // result due to combining of adjacent and overlapping ranges. + int read = i; // Range to insert. + int num_canonical = i; // Length of canonicalized part of list. + do { + num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical, + character_ranges->at(read)); + read++; + } while (read < n); + character_ranges->Rewind(num_canonical); + + DCHECK(CharacterRange::IsCanonical(character_ranges)); +} + +// static +// Appends the complement of the canonical list |ranges|, taken over +// [0, kMaxCodePoint], to |negated_ranges|, which must be empty on entry. +void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges, + ZoneList<CharacterRange>* negated_ranges, + Zone* zone) { + DCHECK(CharacterRange::IsCanonical(ranges)); + DCHECK_EQ(0, negated_ranges->length()); + int range_count = ranges->length(); + base::uc32 from = 0; + int i = 0; + // A first range that starts at 0 leaves no gap in front of it. + if (range_count > 0 && ranges->at(0).from() == 0) { + from = ranges->at(0).to() + 1; + i = 1; + } + // Emit the gap in front of each remaining range. + while (i < range_count) { + CharacterRange range = ranges->at(i); + negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); + from = range.to() + 1; + i++; + } + // Emit the gap after the last range, if any. + // NOTE(review): the test is strict, so when from == kMaxCodePoint (the last + // input range ends at kMaxCodePoint - 1) the single code point + // kMaxCodePoint is not added. Confirm whether `<=` was intended. + if (from < kMaxCodePoint) { + negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone); + } +} + +// static +// Appends the intersection of the canonical lists |lhs| and |rhs| to +// |intersection|, which must be empty on entry. +void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs, + const ZoneList<CharacterRange>* rhs, + ZoneList<CharacterRange>* intersection, + Zone* zone) { + DCHECK(CharacterRange::IsCanonical(lhs)); + DCHECK(CharacterRange::IsCanonical(rhs)); + DCHECK_EQ(0, intersection->length()); + int lhs_index = 0; + int rhs_index = 0; + while (lhs_index < lhs->length() && rhs_index < rhs->length()) { + // Skip non-overlapping ranges. 
+ if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) { + lhs_index++; + continue; + } + if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) { + rhs_index++; + continue; + } + + base::uc32 from = + std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from()); + base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to()); + intersection->Add(CharacterRange::Range(from, to), zone); + if (to == lhs->at(lhs_index).to()) { + lhs_index++; + } else { + rhs_index++; + } + } + + DCHECK(IsCanonical(intersection)); +} + +namespace { + +// Advance |index| and set |from| and |to| to the new range, if not out of +// bounds of |range|, otherwise |from| is set to a code point beyond the legal +// unicode character range and |to| is left unchanged. +void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index, + base::uc32* from, base::uc32* to) { + ++(*index); + if (*index < range->length()) { + *from = range->at(*index).from(); + *to = range->at(*index).to(); + } else { + *from = kMaxCodePoint + 1; + } +} + +} // namespace + +// static +// Appends to |result| (which must be empty on entry) the parts of the +// canonical list |src| that are not covered by the canonical list +// |to_remove|. Inside the loop, [from, to] is the part of the current |src| +// range that has not been emitted or removed yet. +void CharacterRange::Subtract(const ZoneList<CharacterRange>* src, + const ZoneList<CharacterRange>* to_remove, + ZoneList<CharacterRange>* result, Zone* zone) { + DCHECK(CharacterRange::IsCanonical(src)); + DCHECK(CharacterRange::IsCanonical(to_remove)); + DCHECK_EQ(0, result->length()); + + if (src->is_empty()) return; + + int src_index = 0; + int to_remove_index = 0; + base::uc32 from = src->at(src_index).from(); + base::uc32 to = src->at(src_index).to(); + while (src_index < src->length() && to_remove_index < to_remove->length()) { + CharacterRange remove_range = to_remove->at(to_remove_index); + if (remove_range.to() < from) { + // (a) Non-overlapping case, ignore current to_remove range. + // |-------| + // |-------| + to_remove_index++; + } else if (to < remove_range.from()) { + // (b) Non-overlapping case, add full current range to result. 
+ // |-------| + // |-------| + result->Add(CharacterRange::Range(from, to), zone); + SafeAdvanceRange(src, &src_index, &from, &to); + } else if (from >= remove_range.from() && to <= remove_range.to()) { + // (c) Current to_remove range fully covers current range. + // |---| + // |-------| + SafeAdvanceRange(src, &src_index, &from, &to); + } else if (from < remove_range.from() && to > remove_range.to()) { + // (d) Split current range: emit the part in front of the removed range + // and keep the part behind it as the current range. + // |-------| + // |---| + result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone); + from = remove_range.to() + 1; + to_remove_index++; + } else if (from < remove_range.from()) { + // (e) End current range: only the part in front of the removed range + // survives. + // |-------| + // |-------| + to = remove_range.from() - 1; + result->Add(CharacterRange::Range(from, to), zone); + SafeAdvanceRange(src, &src_index, &from, &to); + } else if (to > remove_range.to()) { + // (f) Modify start of current range: the removed range cuts off its + // beginning. + // |-------| + // |-------| + from = remove_range.to() + 1; + to_remove_index++; + } else { + UNREACHABLE(); + } + } + // The last range needs special treatment after |to_remove| is exhausted, as + // |from| might have been modified by the last |to_remove| range and |to| was + // not yet known (i.e. cases d and f). + // If |src| was exhausted instead, SafeAdvanceRange has set |from| to + // kMaxCodePoint + 1, which is expected to fail the test below. + if (from <= to) { + result->Add(CharacterRange::Range(from, to), zone); + } + // Step past the current range, which has been dealt with above. + src_index++; + + // Add remaining ranges after |to_remove| is exhausted. + for (; src_index < src->length(); src_index++) { + result->Add(src->at(src_index), zone); + } + + DCHECK(IsCanonical(result)); +} + +// static +// Restricts the canonical list |ranges| in place to the code units up to +// String::kMaxOneByteCharCodeU. +void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) { + DCHECK(IsCanonical(ranges)); + + // Drop all ranges that don't contain one-byte code units, and clamp the last + // range s.t. it likewise only contains one-byte code units. Note this relies + // on `ranges` being canonicalized, i.e. sorted and non-overlapping. 
+ + static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU; + int n = ranges->length(); + for (; n > 0; n--) { + CharacterRange& r = ranges->at(n - 1); + if (r.from() <= max_char) { + r.to_ = std::min(r.to_, max_char); + break; + } + } + + ranges->Rewind(n); +} + +// static +// Returns true if the canonical lists |lhs| and |rhs| have the same length +// and pairwise equal ranges. +bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs, + const ZoneList<CharacterRange>* rhs) { + DCHECK(IsCanonical(lhs)); + DCHECK(IsCanonical(rhs)); + if (lhs->length() != rhs->length()) return false; + + for (int i = 0; i < lhs->length(); i++) { + if (lhs->at(i) != rhs->at(i)) return false; + } + + return true; +} + +namespace { + +// Scoped object to keep track of how much we unroll quantifier loops in the +// regexp graph generator. +// If the compiler's current expansion factor is still within +// kMaxExpansionFactor, the constructor scales it by |factor| and records +// whether the result stays within the limit; the destructor restores the +// factor that was current at construction. +class RegExpExpansionLimiter { + public: + static const int kMaxExpansionFactor = 6; + RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) + : compiler_(compiler), + saved_expansion_factor_(compiler->current_expansion_factor()), + ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { + DCHECK_LT(0, factor); + if (ok_to_expand_) { + if (factor > kMaxExpansionFactor) { + // Avoid integer overflow of the current expansion factor: do not + // multiply at all, just record that the limit has been exceeded. 
+ ok_to_expand_ = false; + compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); + } else { + int new_factor = saved_expansion_factor_ * factor; + ok_to_expand_ = (new_factor <= kMaxExpansionFactor); + compiler->set_current_expansion_factor(new_factor); + } + } + } + + ~RegExpExpansionLimiter() { + compiler_->set_current_expansion_factor(saved_expansion_factor_); + } + + bool ok_to_expand() { return ok_to_expand_; } + + private: + RegExpCompiler* compiler_; + int saved_expansion_factor_; + bool ok_to_expand_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); +}; + +} // namespace + +RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, + RegExpTree* body, RegExpCompiler* compiler, + RegExpNode* on_success, + bool not_at_start) { + // x{f, t} becomes this: + // + // (r++)<-. + // | ` + // | (x) + // v ^ + // (r=0)-->(?)---/ [if r < t] + // | + // [if r >= f] \----> ... + // + + // 15.10.2.5 RepeatMatcher algorithm. + // The parser has already eliminated the case where max is 0. In the case + // where max_match is zero the parser has removed the quantifier if min was + // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. + + // If we know that we cannot match zero length then things are a little + // simpler since we don't need to make the special zero length match check + // from step 2.1. If the min and max are small we can unroll a little in + // this case. + static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} + static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} + if (max == 0) return on_success; // This can happen due to recursion. 
+ bool body_can_be_empty = (body->min_match() == 0); + int body_start_reg = RegExpCompiler::kNoRegister; + Interval capture_registers = body->CaptureRegisters(); + bool needs_capture_clearing = !capture_registers.is_empty(); + Zone* zone = compiler->zone(); + + if (body_can_be_empty) { + body_start_reg = compiler->AllocateRegister(); + } else if (compiler->optimize() && !needs_capture_clearing) { + // Only unroll if there are no captures and the body can't be + // empty. + { + RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0)); + if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { + int new_max = (max == kInfinity) ? max : max - min; + // Recurse once to get the loop or optional matches after the fixed + // ones. + RegExpNode* answer = + ToNode(0, new_max, is_greedy, body, compiler, on_success, true); + // Unroll the forced matches from 0 to min. This can cause chains of + // TextNodes (which the parser does not generate). These should be + // combined if it turns out they hinder good code generation. + for (int i = 0; i < min; i++) { + answer = body->ToNode(compiler, answer); + } + return answer; + } + } + if (max <= kMaxUnrolledMaxMatches && min == 0) { + DCHECK_LT(0, max); // Due to the 'if' above. + RegExpExpansionLimiter limiter(compiler, max); + if (limiter.ok_to_expand()) { + // Unroll the optional matches up to max. 
+ RegExpNode* answer = on_success; + for (int i = 0; i < max; i++) { + ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone); + if (is_greedy) { + alternation->AddAlternative( + GuardedAlternative(body->ToNode(compiler, answer))); + alternation->AddAlternative(GuardedAlternative(on_success)); + } else { + alternation->AddAlternative(GuardedAlternative(on_success)); + alternation->AddAlternative( + GuardedAlternative(body->ToNode(compiler, answer))); + } + answer = alternation; + if (not_at_start && !compiler->read_backward()) { + alternation->set_not_at_start(); + } + } + return answer; + } + } + } + bool has_min = min > 0; + bool has_max = max < RegExpTree::kInfinity; + bool needs_counter = has_min || has_max; + int reg_ctr = needs_counter ? compiler->AllocateRegister() + : RegExpCompiler::kNoRegister; + LoopChoiceNode* center = zone->New<LoopChoiceNode>( + body->min_match() == 0, compiler->read_backward(), min, zone); + if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); + RegExpNode* loop_return = + needs_counter ? static_cast<RegExpNode*>( + ActionNode::IncrementRegister(reg_ctr, center)) + : static_cast<RegExpNode*>(center); + if (body_can_be_empty) { + // If the body can be empty we need to check if it was and then + // backtrack. + loop_return = + ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return); + } + RegExpNode* body_node = body->ToNode(compiler, loop_return); + if (body_can_be_empty) { + // If the body can be empty we need to store the start position + // so we can bail out if it was empty. + body_node = ActionNode::StorePosition(body_start_reg, false, body_node); + } + if (needs_capture_clearing) { + // Before entering the body of this loop we need to clear captures. 
+ body_node = ActionNode::ClearCaptures(capture_registers, body_node); + } + GuardedAlternative body_alt(body_node); + if (has_max) { + Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max); + body_alt.AddGuard(body_guard, zone); + } + GuardedAlternative rest_alt(on_success); + if (has_min) { + Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min); + rest_alt.AddGuard(rest_guard, zone); + } + if (is_greedy) { + center->AddLoopAlternative(body_alt); + center->AddContinueAlternative(rest_alt); + } else { + center->AddContinueAlternative(rest_alt); + center->AddLoopAlternative(body_alt); + } + if (needs_counter) { + return ActionNode::SetRegisterForLoop(reg_ctr, 0, center); + } else { + return center; + } +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-compiler.cc b/js/src/irregexp/imported/regexp-compiler.cc new file mode 100644 index 0000000000..514975d8ed --- /dev/null +++ b/js/src/irregexp/imported/regexp-compiler.cc @@ -0,0 +1,3955 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/imported/regexp-compiler.h" + +#include "irregexp/imported/regexp-macro-assembler-arch.h" + +#ifdef V8_INTL_SUPPORT +#include "irregexp/imported/special-case.h" +#include "unicode/locid.h" +#include "unicode/uniset.h" +#include "unicode/utypes.h" +#endif // V8_INTL_SUPPORT + +namespace v8 { +namespace internal { + +using namespace regexp_compiler_constants; // NOLINT(build/namespaces) + +// ------------------------------------------------------------------- +// Implementation of the Irregexp regular expression engine. +// +// The Irregexp regular expression engine is intended to be a complete +// implementation of ECMAScript regular expressions. It generates either +// bytecodes or native code. + +// The Irregexp regexp engine is structured in three steps. 
+// 1) The parser generates an abstract syntax tree. See ast.cc. +// 2) From the AST a node network is created. The nodes are all +// subclasses of RegExpNode. The nodes represent states when +// executing a regular expression. Several optimizations are +// performed on the node network. +// 3) From the nodes we generate either byte codes or native code +// that can actually execute the regular expression (perform +// the search). The code generation step is described in more +// detail below. + +// Code generation. +// +// The nodes are divided into four main categories. +// * Choice nodes +// These represent places where the regular expression can +// match in more than one way. For example on entry to an +// alternation (foo|bar) or a repetition (*, +, ? or {}). +// * Action nodes +// These represent places where some action should be +// performed. Examples include recording the current position +// in the input string to a register (in order to implement +// captures) or other actions on register for example in order +// to implement the counters needed for {} repetitions. +// * Matching nodes +// These attempt to match some element part of the input string. +// Examples of elements include character classes, plain strings +// or back references. +// * End nodes +// These are used to implement the actions required on finding +// a successful match or failing to find a match. +// +// The code generated (whether as byte codes or native code) maintains +// some state as it runs. This consists of the following elements: +// +// * The capture registers. Used for string captures. +// * Other registers. Used for counters etc. +// * The current position. +// * The stack of backtracking information. Used when a matching node +// fails to find a match and needs to try an alternative. +// +// Conceptual regular expression execution model: +// +// There is a simple conceptual model of regular expression execution +// which will be presented first. 
The actual code generated is a more +// efficient simulation of the simple conceptual model: +// +// * Choice nodes are implemented as follows: +// For each choice except the last { +// push current position +// push backtrack code location +// <generate code to test for choice> +// backtrack code location: +// pop current position +// } +// <generate code to test for last choice> +// +// * Action nodes are generated as follows: +// <push affected registers on backtrack stack> +// <generate code to perform action> +// push backtrack code location +// <generate code to test for following nodes> +// backtrack code location: +// <pop affected registers to restore their state> +// <pop backtrack location from stack and go to it> +// +// * Matching nodes are generated as follows: +// if input string matches at current position +// update current position +// <generate code to test for following nodes> +// else +// <pop backtrack location from stack and go to it> +// +// Thus it can be seen that the current position is saved and restored +// by the choice nodes, whereas the registers are saved and restored +// by the action nodes that manipulate them. +// +// The other interesting aspect of this model is that nodes are generated +// at the point where they are needed by a recursive call to Emit(). If +// the node has already been code generated then the Emit() call will +// generate a jump to the previously generated code instead. In order to +// limit recursion it is possible for the Emit() function to put the node +// on a work list for later generation and instead generate a jump. The +// destination of the jump is resolved later when the code is generated. +// +// Actual regular expression code generation. +// +// Code generation is actually more complicated than the above. In order to +// improve the efficiency of the generated code some optimizations are +// performed: +// +// * Choice nodes have 1-character lookahead. 
+// A choice node looks at the following character and eliminates some of +// the choices immediately based on that character. This is not yet +// implemented. +// * Simple greedy loops store reduced backtracking information. +// A quantifier like /.*foo/m will greedily match the whole input. It will +// then need to backtrack to a point where it can match "foo". The naive +// implementation of this would push each character position onto the +// backtracking stack, then pop them off one by one. This would use space +// proportional to the length of the input string. However since the "." +// can only match in one way and always has a constant length (in this case +// of 1) it suffices to store the current position on the top of the stack +// once. Matching now becomes merely incrementing the current position and +// backtracking becomes decrementing the current position and checking the +// result against the stored current position. This is faster and saves +// space. +// * The current state is virtualized. +// This is used to defer expensive operations until it is clear that they +// are needed and to generate code for a node more than once, allowing +// specialized and efficient versions of the code to be created. This is +// explained in the section below. +// +// Execution state virtualization. +// +// Instead of emitting code, nodes that manipulate the state can record their +// manipulation in an object called the Trace. The Trace object can record a +// current position offset, an optional backtrack code location on the top of +// the virtualized backtrack stack and some register changes. When a node is +// to be emitted it can flush the Trace or update it. Flushing the Trace +// will emit code to bring the actual state into line with the virtual state. +// Avoiding flushing the state can postpone some work (e.g. updates of capture +// registers). 
Postponing work can save time when executing the regular +// expression since it may be found that the work never has to be done as a +// failure to match can occur. In addition it is much faster to jump to a +// known backtrack code location than it is to pop an unknown backtrack +// location from the stack and jump there. +// +// The virtual state found in the Trace affects code generation. For example +// the virtual state contains the difference between the actual current +// position and the virtual current position, and matching code needs to use +// this offset to attempt a match in the correct location of the input +// string. Therefore code generated for a non-trivial trace is specialized +// to that trace. The code generator therefore has the ability to generate +// code for each node several times. In order to limit the size of the +// generated code there is an arbitrary limit on how many specialized sets of +// code may be generated for a given node. If the limit is reached, the +// trace is flushed and a generic version of the code for a node is emitted. +// This is subsequently used for that node. The code emitted for non-generic +// trace is not recorded in the node and so it cannot currently be reused in +// the event that code generation is requested for an identical trace. + +namespace { + +constexpr base::uc32 MaxCodeUnit(const bool one_byte) { + static_assert(String::kMaxOneByteCharCodeU <= + std::numeric_limits<uint16_t>::max()); + static_assert(String::kMaxUtf16CodeUnitU <= + std::numeric_limits<uint16_t>::max()); + return one_byte ? 
String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU; +} + +constexpr uint32_t CharMask(const bool one_byte) { + static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); + static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); + return MaxCodeUnit(one_byte); +} + +} // namespace + +void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); } + +void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { + text->AddElement(TextElement::Atom(this), zone); +} + +void RegExpClassRanges::AppendToText(RegExpText* text, Zone* zone) { + text->AddElement(TextElement::ClassRanges(this), zone); +} + +void RegExpText::AppendToText(RegExpText* text, Zone* zone) { + for (int i = 0; i < elements()->length(); i++) + text->AddElement(elements()->at(i), zone); +} + +TextElement TextElement::Atom(RegExpAtom* atom) { + return TextElement(ATOM, atom); +} + +TextElement TextElement::ClassRanges(RegExpClassRanges* class_ranges) { + return TextElement(CLASS_RANGES, class_ranges); +} + +int TextElement::length() const { + switch (text_type()) { + case ATOM: + return atom()->length(); + + case CLASS_RANGES: + return 1; + } + UNREACHABLE(); +} + +class RecursionCheck { + public: + explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { + compiler->IncrementRecursionDepth(); + } + ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } + + private: + RegExpCompiler* compiler_; +}; + +// Attempts to compile the regexp using an Irregexp code generator. Returns +// a fixed array or a null handle depending on whether it succeeded. 
+RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, + RegExpFlags flags, bool one_byte) + : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)), + unicode_lookaround_stack_register_(kNoRegister), + unicode_lookaround_position_register_(kNoRegister), + work_list_(nullptr), + recursion_depth_(0), + flags_(flags), + one_byte_(one_byte), + reg_exp_too_big_(false), + limiting_recursion_(false), + optimize_(v8_flags.regexp_optimization), + read_backward_(false), + current_expansion_factor_(1), + frequency_collator_(), + isolate_(isolate), + zone_(zone) { + accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone); + DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1); +} + +RegExpCompiler::CompilationResult RegExpCompiler::Assemble( + Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, + int capture_count, Handle<String> pattern) { + macro_assembler_ = macro_assembler; + + ZoneVector<RegExpNode*> work_list(zone()); + work_list_ = &work_list; + Label fail; + macro_assembler_->PushBacktrack(&fail); + Trace new_trace; + start->Emit(this, &new_trace); + macro_assembler_->BindJumpTarget(&fail); + macro_assembler_->Fail(); + while (!work_list.empty()) { + RegExpNode* node = work_list.back(); + work_list.pop_back(); + node->set_on_work_list(false); + if (!node->label()->is_bound()) node->Emit(this, &new_trace); + } + if (reg_exp_too_big_) { + if (v8_flags.correctness_fuzzer_suppressions) { + FATAL("Aborting on excess zone allocation"); + } + macro_assembler_->AbortedCodeGeneration(); + return CompilationResult::RegExpTooBig(); + } + + Handle<HeapObject> code = macro_assembler_->GetCode(pattern); + isolate->IncreaseTotalRegexpCodeGenerated(code); + work_list_ = nullptr; + + return {code, next_register_}; +} + +bool Trace::DeferredAction::Mentions(int that) { + if (action_type() == ActionNode::CLEAR_CAPTURES) { + Interval range = static_cast<DeferredClearCaptures*>(this)->range(); + return 
range.Contains(that); + } else { + return reg() == that; + } +} + +bool Trace::mentions_reg(int reg) { + for (DeferredAction* action = actions_; action != nullptr; + action = action->next()) { + if (action->Mentions(reg)) return true; + } + return false; +} + +bool Trace::GetStoredPosition(int reg, int* cp_offset) { + DCHECK_EQ(0, *cp_offset); + for (DeferredAction* action = actions_; action != nullptr; + action = action->next()) { + if (action->Mentions(reg)) { + if (action->action_type() == ActionNode::STORE_POSITION) { + *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); + return true; + } else { + return false; + } + } + } + return false; +} + +// A (dynamically-sized) set of unsigned integers that behaves especially well +// on small integers (< kFirstLimit). May do zone-allocation. +class DynamicBitSet : public ZoneObject { + public: + V8_EXPORT_PRIVATE bool Get(unsigned value) const { + if (value < kFirstLimit) { + return (first_ & (1 << value)) != 0; + } else if (remaining_ == nullptr) { + return false; + } else { + return remaining_->Contains(value); + } + } + + // Destructively set a value in this set. 
+ void Set(unsigned value, Zone* zone) { + if (value < kFirstLimit) { + first_ |= (1 << value); + } else { + if (remaining_ == nullptr) + remaining_ = zone->New<ZoneList<unsigned>>(1, zone); + if (remaining_->is_empty() || !remaining_->Contains(value)) + remaining_->Add(value, zone); + } + } + + private: + static constexpr unsigned kFirstLimit = 32; + + uint32_t first_ = 0; + ZoneList<unsigned>* remaining_ = nullptr; +}; + +int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers, + Zone* zone) { + int max_register = RegExpCompiler::kNoRegister; + for (DeferredAction* action = actions_; action != nullptr; + action = action->next()) { + if (action->action_type() == ActionNode::CLEAR_CAPTURES) { + Interval range = static_cast<DeferredClearCaptures*>(action)->range(); + for (int i = range.from(); i <= range.to(); i++) + affected_registers->Set(i, zone); + if (range.to() > max_register) max_register = range.to(); + } else { + affected_registers->Set(action->reg(), zone); + if (action->reg() > max_register) max_register = action->reg(); + } + } + return max_register; +} + +void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, + int max_register, + const DynamicBitSet& registers_to_pop, + const DynamicBitSet& registers_to_clear) { + for (int reg = max_register; reg >= 0; reg--) { + if (registers_to_pop.Get(reg)) { + assembler->PopRegister(reg); + } else if (registers_to_clear.Get(reg)) { + int clear_to = reg; + while (reg > 0 && registers_to_clear.Get(reg - 1)) { + reg--; + } + assembler->ClearRegisters(reg, clear_to); + } + } +} + +void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, + int max_register, + const DynamicBitSet& affected_registers, + DynamicBitSet* registers_to_pop, + DynamicBitSet* registers_to_clear, + Zone* zone) { + // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 
+ const int push_limit = (assembler->stack_limit_slack() + 1) / 2; + + // Count pushes performed to force a stack limit check occasionally. + int pushes = 0; + + for (int reg = 0; reg <= max_register; reg++) { + if (!affected_registers.Get(reg)) continue; + + // The chronologically first deferred action in the trace + // is used to infer the action needed to restore a register + // to its previous state (or not, if it's safe to ignore it). + enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; + DeferredActionUndoType undo_action = IGNORE; + + int value = 0; + bool absolute = false; + bool clear = false; + static const int kNoStore = kMinInt; + int store_position = kNoStore; + // This is a little tricky because we are scanning the actions in reverse + // historical order (newest first). + for (DeferredAction* action = actions_; action != nullptr; + action = action->next()) { + if (action->Mentions(reg)) { + switch (action->action_type()) { + case ActionNode::SET_REGISTER_FOR_LOOP: { + Trace::DeferredSetRegisterForLoop* psr = + static_cast<Trace::DeferredSetRegisterForLoop*>(action); + if (!absolute) { + value += psr->value(); + absolute = true; + } + // SET_REGISTER_FOR_LOOP is only used for newly introduced loop + // counters. They can have a significant previous value if they + // occur in a loop. TODO(lrn): Propagate this information, so + // we can set undo_action to IGNORE if we know there is no value to + // restore. + undo_action = RESTORE; + DCHECK_EQ(store_position, kNoStore); + DCHECK(!clear); + break; + } + case ActionNode::INCREMENT_REGISTER: + if (!absolute) { + value++; + } + DCHECK_EQ(store_position, kNoStore); + DCHECK(!clear); + undo_action = RESTORE; + break; + case ActionNode::STORE_POSITION: { + Trace::DeferredCapture* pc = + static_cast<Trace::DeferredCapture*>(action); + if (!clear && store_position == kNoStore) { + store_position = pc->cp_offset(); + } + + // For captures we know that stores and clears alternate. 
+ // Other register, are never cleared, and if the occur + // inside a loop, they might be assigned more than once. + if (reg <= 1) { + // Registers zero and one, aka "capture zero", is + // always set correctly if we succeed. There is no + // need to undo a setting on backtrack, because we + // will set it again or fail. + undo_action = IGNORE; + } else { + undo_action = pc->is_capture() ? CLEAR : RESTORE; + } + DCHECK(!absolute); + DCHECK_EQ(value, 0); + break; + } + case ActionNode::CLEAR_CAPTURES: { + // Since we're scanning in reverse order, if we've already + // set the position we have to ignore historically earlier + // clearing operations. + if (store_position == kNoStore) { + clear = true; + } + undo_action = RESTORE; + DCHECK(!absolute); + DCHECK_EQ(value, 0); + break; + } + default: + UNREACHABLE(); + } + } + } + // Prepare for the undo-action (e.g., push if it's going to be popped). + if (undo_action == RESTORE) { + pushes++; + RegExpMacroAssembler::StackCheckFlag stack_check = + RegExpMacroAssembler::kNoStackLimitCheck; + if (pushes == push_limit) { + stack_check = RegExpMacroAssembler::kCheckStackLimit; + pushes = 0; + } + + assembler->PushRegister(reg, stack_check); + registers_to_pop->Set(reg, zone); + } else if (undo_action == CLEAR) { + registers_to_clear->Set(reg, zone); + } + // Perform the chronologically last action (or accumulated increment) + // for the register. + if (store_position != kNoStore) { + assembler->WriteCurrentPositionToRegister(reg, store_position); + } else if (clear) { + assembler->ClearRegisters(reg, reg); + } else if (absolute) { + assembler->SetRegister(reg, value); + } else if (value != 0) { + assembler->AdvanceRegister(reg, value); + } + } +} + +// This is called as we come into a loop choice node and some other tricky +// nodes. It normalizes the state of the code generator to ensure we can +// generate generic code. 
+void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + + DCHECK(!is_trivial()); + + if (actions_ == nullptr && backtrack() == nullptr) { + // Here we just have some deferred cp advances to fix and we are back to + // a normal situation. We may also have to forget some information gained + // through a quick check that was already performed. + if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); + // Create a new trivial state and generate the node with that. + Trace new_state; + successor->Emit(compiler, &new_state); + return; + } + + // Generate deferred actions here along with code to undo them again. + DynamicBitSet affected_registers; + + if (backtrack() != nullptr) { + // Here we have a concrete backtrack location. These are set up by choice + // nodes and so they indicate that we have a deferred save of the current + // position which we may need to emit here. + assembler->PushCurrentPosition(); + } + + int max_register = + FindAffectedRegisters(&affected_registers, compiler->zone()); + DynamicBitSet registers_to_pop; + DynamicBitSet registers_to_clear; + PerformDeferredActions(assembler, max_register, affected_registers, + ®isters_to_pop, ®isters_to_clear, + compiler->zone()); + if (cp_offset_ != 0) { + assembler->AdvanceCurrentPosition(cp_offset_); + } + + // Create a new trivial state and generate the node with that. + Label undo; + assembler->PushBacktrack(&undo); + if (successor->KeepRecursing(compiler)) { + Trace new_state; + successor->Emit(compiler, &new_state); + } else { + compiler->AddWork(successor); + assembler->GoTo(successor->label()); + } + + // On backtrack we need to restore state. 
+ assembler->BindJumpTarget(&undo); + RestoreAffectedRegisters(assembler, max_register, registers_to_pop, + registers_to_clear); + if (backtrack() == nullptr) { + assembler->Backtrack(); + } else { + assembler->PopCurrentPosition(); + assembler->GoTo(backtrack()); + } +} + +void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + + // Omit flushing the trace. We discard the entire stack frame anyway. + + if (!label()->is_bound()) { + // We are completely independent of the trace, since we ignore it, + // so this code can be used as the generic version. + assembler->Bind(label()); + } + + // Throw away everything on the backtrack stack since the start + // of the negative submatch and restore the character position. + assembler->ReadCurrentPositionFromRegister(current_position_register_); + assembler->ReadStackPointerFromRegister(stack_pointer_register_); + if (clear_capture_count_ > 0) { + // Clear any captures that might have been performed during the success + // of the body of the negative look-ahead. + int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; + assembler->ClearRegisters(clear_capture_start_, clear_capture_end); + } + // Now that we have unwound the stack we find at the top of the stack the + // backtrack that the BeginNegativeSubmatch node got. + assembler->Backtrack(); +} + +void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { + if (!trace->is_trivial()) { + trace->Flush(compiler, this); + return; + } + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + if (!label()->is_bound()) { + assembler->Bind(label()); + } + switch (action_) { + case ACCEPT: + assembler->Succeed(); + return; + case BACKTRACK: + assembler->GoTo(trace->backtrack()); + return; + case NEGATIVE_SUBMATCH_SUCCESS: + // This case is handled in a different virtual method. 
+ UNREACHABLE(); + } + UNIMPLEMENTED(); +} + +void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { + if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone); + guards_->Add(guard, zone); +} + +ActionNode* ActionNode::SetRegisterForLoop(int reg, int val, + RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success); + result->data_.u_store_register.reg = reg; + result->data_.u_store_register.value = val; + return result; +} + +ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success); + result->data_.u_increment_register.reg = reg; + return result; +} + +ActionNode* ActionNode::StorePosition(int reg, bool is_capture, + RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(STORE_POSITION, on_success); + result->data_.u_position_register.reg = reg; + result->data_.u_position_register.is_capture = is_capture; + return result; +} + +ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success); + result->data_.u_clear_captures.range_from = range.from(); + result->data_.u_clear_captures.range_to = range.to(); + return result; +} + +ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg, + RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success); + result->data_.u_submatch.stack_pointer_register = stack_reg; + result->data_.u_submatch.current_position_register = position_reg; + return result; +} + +ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg, + RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success); + result->data_.u_submatch.stack_pointer_register = 
stack_reg; + result->data_.u_submatch.current_position_register = position_reg; + return result; +} + +ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg, + int clear_register_count, + int clear_register_from, + RegExpNode* on_success) { + ActionNode* result = on_success->zone()->New<ActionNode>( + POSITIVE_SUBMATCH_SUCCESS, on_success); + result->data_.u_submatch.stack_pointer_register = stack_reg; + result->data_.u_submatch.current_position_register = position_reg; + result->data_.u_submatch.clear_register_count = clear_register_count; + result->data_.u_submatch.clear_register_from = clear_register_from; + return result; +} + +ActionNode* ActionNode::EmptyMatchCheck(int start_register, + int repetition_register, + int repetition_limit, + RegExpNode* on_success) { + ActionNode* result = + on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success); + result->data_.u_empty_match_check.start_register = start_register; + result->data_.u_empty_match_check.repetition_register = repetition_register; + result->data_.u_empty_match_check.repetition_limit = repetition_limit; + return result; +} + +#define DEFINE_ACCEPT(Type) \ + void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); } +FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) +#undef DEFINE_ACCEPT + +// ------------------------------------------------------------------- +// Emit code. 
+ +void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, + Guard* guard, Trace* trace) { + switch (guard->op()) { + case Guard::LT: + DCHECK(!trace->mentions_reg(guard->reg())); + macro_assembler->IfRegisterGE(guard->reg(), guard->value(), + trace->backtrack()); + break; + case Guard::GEQ: + DCHECK(!trace->mentions_reg(guard->reg())); + macro_assembler->IfRegisterLT(guard->reg(), guard->value(), + trace->backtrack()); + break; + } +} + +namespace { + +#ifdef DEBUG +bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { + static_assert(sizeof(unibrow::uchar) == 4); + for (int i = 0; i < length; i++) { + if (chars[i] > String::kMaxUtf16CodeUnit) return false; + } + return true; +} +#endif // DEBUG + +// Returns the number of characters in the equivalence class, omitting those +// that cannot occur in the source string because it is Latin1. +int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character, + bool one_byte_subject, unibrow::uchar* letters, + int letter_length) { +#ifdef V8_INTL_SUPPORT + if (RegExpCaseFolding::IgnoreSet().contains(character)) { + letters[0] = character; + DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1)); + return 1; + } + bool in_special_add_set = + RegExpCaseFolding::SpecialAddSet().contains(character); + + icu::UnicodeSet set; + set.add(character); + set = set.closeOver(USET_CASE_INSENSITIVE); + + UChar32 canon = 0; + if (in_special_add_set) { + canon = RegExpCaseFolding::Canonicalize(character); + } + + int32_t range_count = set.getRangeCount(); + int items = 0; + for (int32_t i = 0; i < range_count; i++) { + UChar32 start = set.getRangeStart(i); + UChar32 end = set.getRangeEnd(i); + CHECK(end - start + items <= letter_length); + for (UChar32 cu = start; cu <= end; cu++) { + if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; + if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { + continue; + } + letters[items++] = static_cast<unibrow::uchar>(cu); + } + } + 
DCHECK(ContainsOnlyUtf16CodeUnits(letters, items)); + return items; +#else + int length = + isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); + // Unibrow returns 0 or 1 for characters where case independence is + // trivial. + if (length == 0) { + letters[0] = character; + length = 1; + } + + if (one_byte_subject) { + int new_length = 0; + for (int i = 0; i < length; i++) { + if (letters[i] <= String::kMaxOneByteCharCode) { + letters[new_length++] = letters[i]; + } + } + length = new_length; + } + + DCHECK(ContainsOnlyUtf16CodeUnits(letters, length)); + return length; +#endif // V8_INTL_SUPPORT +} + +inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler, + base::uc16 c, Label* on_failure, int cp_offset, + bool check, bool preloaded) { + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + bool bound_checked = false; + if (!preloaded) { + assembler->LoadCurrentCharacter(cp_offset, on_failure, check); + bound_checked = true; + } + assembler->CheckNotCharacter(c, on_failure); + return bound_checked; +} + +// Only emits non-letters (things that don't have case). Only used for case +// independent matches. +inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, + base::uc16 c, Label* on_failure, int cp_offset, + bool check, bool preloaded) { + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + bool one_byte = compiler->one_byte(); + unibrow::uchar chars[4]; + int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); + if (length < 1) { + // This can't match. Must be an one-byte subject and a non-one-byte + // character. We do not need to do anything since the one-byte pass + // already handled this. + return false; // Bounds not checked. + } + bool checked = false; + // We handle the length > 1 case in a later pass. + if (length == 1) { + if (one_byte && c > String::kMaxOneByteCharCodeU) { + // Can't match - see above. + return false; // Bounds not checked. 
+ } + if (!preloaded) { + macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); + checked = check; + } + macro_assembler->CheckNotCharacter(c, on_failure); + } + return checked; +} + +bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, + bool one_byte, base::uc16 c1, base::uc16 c2, + Label* on_failure) { + const uint32_t char_mask = CharMask(one_byte); + base::uc16 exor = c1 ^ c2; + // Check whether exor has only one bit set. + if (((exor - 1) & exor) == 0) { + // If c1 and c2 differ only by one bit. + // Ecma262UnCanonicalize always gives the highest number last. + DCHECK(c2 > c1); + base::uc16 mask = char_mask ^ exor; + macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); + return true; + } + DCHECK(c2 > c1); + base::uc16 diff = c2 - c1; + if (((diff - 1) & diff) == 0 && c1 >= diff) { + // If the characters differ by 2^n but don't differ by one bit then + // subtract the difference from the found character, then do the or + // trick. We avoid the theoretical case where negative numbers are + // involved in order to simplify code generation. + base::uc16 mask = char_mask ^ diff; + macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask, + on_failure); + return true; + } + return false; +} + +// Only emits letters (things that have case). Only used for case independent +// matches. +inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, + base::uc16 c, Label* on_failure, int cp_offset, + bool check, bool preloaded) { + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + bool one_byte = compiler->one_byte(); + unibrow::uchar chars[4]; + int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); + if (length <= 1) return false; + // We may not need to check against the end of the input string + // if this character lies before a character that matched. 
+ if (!preloaded) { + macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); + } + Label ok; + switch (length) { + case 2: { + if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], + chars[1], on_failure)) { + } else { + macro_assembler->CheckCharacter(chars[0], &ok); + macro_assembler->CheckNotCharacter(chars[1], on_failure); + macro_assembler->Bind(&ok); + } + break; + } + case 4: + macro_assembler->CheckCharacter(chars[3], &ok); + V8_FALLTHROUGH; + case 3: + macro_assembler->CheckCharacter(chars[0], &ok); + macro_assembler->CheckCharacter(chars[1], &ok); + macro_assembler->CheckNotCharacter(chars[2], on_failure); + macro_assembler->Bind(&ok); + break; + default: + UNREACHABLE(); + } + return true; +} + +void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, + Label* fall_through, Label* above_or_equal, + Label* below) { + if (below != fall_through) { + masm->CheckCharacterLT(border, below); + if (above_or_equal != fall_through) masm->GoTo(above_or_equal); + } else { + masm->CheckCharacterGT(border - 1, above_or_equal); + } +} + +void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last, + Label* fall_through, Label* in_range, + Label* out_of_range) { + if (in_range == fall_through) { + if (first == last) { + masm->CheckNotCharacter(first, out_of_range); + } else { + masm->CheckCharacterNotInRange(first, last, out_of_range); + } + } else { + if (first == last) { + masm->CheckCharacter(first, in_range); + } else { + masm->CheckCharacterInRange(first, last, in_range); + } + if (out_of_range != fall_through) masm->GoTo(out_of_range); + } +} + +// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. +// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 
+void EmitUseLookupTable(RegExpMacroAssembler* masm, + ZoneList<base::uc32>* ranges, uint32_t start_index, + uint32_t end_index, base::uc32 min_char, + Label* fall_through, Label* even_label, + Label* odd_label) { + static const uint32_t kSize = RegExpMacroAssembler::kTableSize; + static const uint32_t kMask = RegExpMacroAssembler::kTableMask; + + base::uc32 base = (min_char & ~kMask); + USE(base); + + // Assert that everything is on one kTableSize page. + for (uint32_t i = start_index; i <= end_index; i++) { + DCHECK_EQ(ranges->at(i) & ~kMask, base); + } + DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); + + char templ[kSize]; + Label* on_bit_set; + Label* on_bit_clear; + int bit; + if (even_label == fall_through) { + on_bit_set = odd_label; + on_bit_clear = even_label; + bit = 1; + } else { + on_bit_set = even_label; + on_bit_clear = odd_label; + bit = 0; + } + for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; + i++) { + templ[i] = bit; + } + uint32_t j = 0; + bit ^= 1; + for (uint32_t i = start_index; i < end_index; i++) { + for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { + templ[j] = bit; + } + bit ^= 1; + } + for (uint32_t i = j; i < kSize; i++) { + templ[i] = bit; + } + Factory* factory = masm->isolate()->factory(); + // TODO(erikcorry): Cache these. + Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld); + for (uint32_t i = 0; i < kSize; i++) { + ba->set(i, templ[i]); + } + masm->CheckBitInTable(ba, on_bit_set); + if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); +} + +void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges, + uint32_t start_index, uint32_t end_index, uint32_t cut_index, + Label* even_label, Label* odd_label) { + bool odd = (((cut_index - start_index) & 1) == 1); + Label* in_range_label = odd ? 
odd_label : even_label; + Label dummy; + EmitDoubleBoundaryTest(masm, ranges->at(cut_index), + ranges->at(cut_index + 1) - 1, &dummy, in_range_label, + &dummy); + DCHECK(!dummy.is_linked()); + // Cut out the single range by rewriting the array. This creates a new + // range that is a merger of the two ranges on either side of the one we + // are cutting out. The oddity of the labels is preserved. + for (uint32_t j = cut_index; j > start_index; j--) { + ranges->at(j) = ranges->at(j - 1); + } + for (uint32_t j = cut_index + 1; j < end_index; j++) { + ranges->at(j) = ranges->at(j + 1); + } +} + +// Unicode case. Split the search space into kSize spaces that are handled +// with recursion. +void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index, + uint32_t end_index, uint32_t* new_start_index, + uint32_t* new_end_index, base::uc32* border) { + static const uint32_t kSize = RegExpMacroAssembler::kTableSize; + static const uint32_t kMask = RegExpMacroAssembler::kTableMask; + + base::uc32 first = ranges->at(start_index); + base::uc32 last = ranges->at(end_index) - 1; + + *new_start_index = start_index; + *border = (ranges->at(start_index) & ~kMask) + kSize; + while (*new_start_index < end_index) { + if (ranges->at(*new_start_index) > *border) break; + (*new_start_index)++; + } + // new_start_index is the index of the first edge that is beyond the + // current kSize space. + + // For very large search spaces we do a binary chop search of the non-Latin1 + // space instead of just going to the end of the current kSize space. The + // heuristics are complicated a little by the fact that any 128-character + // encoding space can be quickly tested with a table lookup, so we don't + // wish to do binary chop search at a smaller granularity than that. A + // 128-character space can take up a lot of space in the ranges array if, + // for example, we only want to match every second character (eg. the lower + // case characters on some Unicode pages). 
+ uint32_t binary_chop_index = (end_index + start_index) / 2; + // The first test ensures that we get to the code that handles the Latin1 + // range with a single not-taken branch, speeding up this important + // character range (even non-Latin1 charset-based text has spaces and + // punctuation). + if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. + end_index - start_index > (*new_start_index - start_index) * 2 && + last - first > kSize * 2 && binary_chop_index > *new_start_index && + ranges->at(binary_chop_index) >= first + 2 * kSize) { + uint32_t scan_forward_for_section_border = binary_chop_index; + uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1; + + while (scan_forward_for_section_border < end_index) { + if (ranges->at(scan_forward_for_section_border) > new_border) { + *new_start_index = scan_forward_for_section_border; + *border = new_border; + break; + } + scan_forward_for_section_border++; + } + } + + DCHECK(*new_start_index > start_index); + *new_end_index = *new_start_index - 1; + if (ranges->at(*new_end_index) == *border) { + (*new_end_index)--; + } + if (*border >= ranges->at(end_index)) { + *border = ranges->at(end_index); + *new_start_index = end_index; // Won't be used. + *new_end_index = end_index - 1; + } +} + +// Gets a series of segment boundaries representing a character class. If the +// character is in the range between an even and an odd boundary (counting from +// start_index) then go to even_label, otherwise go to odd_label. We already +// know that the character is in the range of min_char to max_char inclusive. +// Either label can be nullptr indicating backtracking. Either label can also +// be equal to the fall_through label. 
+void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges, + uint32_t start_index, uint32_t end_index, + base::uc32 min_char, base::uc32 max_char, + Label* fall_through, Label* even_label, + Label* odd_label) { + DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); + DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); + + base::uc32 first = ranges->at(start_index); + base::uc32 last = ranges->at(end_index) - 1; + + DCHECK_LT(min_char, first); + + // Just need to test if the character is before or on-or-after + // a particular character. + if (start_index == end_index) { + EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); + return; + } + + // Another almost trivial case: There is one interval in the middle that is + // different from the end intervals. + if (start_index + 1 == end_index) { + EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label, + odd_label); + return; + } + + // It's not worth using table lookup if there are very few intervals in the + // character class. + if (end_index - start_index <= 6) { + // It is faster to test for individual characters, so we look for those + // first, then try arbitrary ranges in the second round. + static uint32_t kNoCutIndex = -1; + uint32_t cut = kNoCutIndex; + for (uint32_t i = start_index; i < end_index; i++) { + if (ranges->at(i) == ranges->at(i + 1) - 1) { + cut = i; + break; + } + } + if (cut == kNoCutIndex) cut = start_index; + CutOutRange(masm, ranges, start_index, end_index, cut, even_label, + odd_label); + DCHECK_GE(end_index - start_index, 2); + GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char, + max_char, fall_through, even_label, odd_label); + return; + } + + // If there are a lot of intervals in the regexp, then we will use tables to + // determine whether the character is inside or outside the character class. 
+ static const int kBits = RegExpMacroAssembler::kTableSizeBits; + + if ((max_char >> kBits) == (min_char >> kBits)) { + EmitUseLookupTable(masm, ranges, start_index, end_index, min_char, + fall_through, even_label, odd_label); + return; + } + + if ((min_char >> kBits) != first >> kBits) { + masm->CheckCharacterLT(first, odd_label); + GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char, + fall_through, odd_label, even_label); + return; + } + + uint32_t new_start_index = 0; + uint32_t new_end_index = 0; + base::uc32 border = 0; + + SplitSearchSpace(ranges, start_index, end_index, &new_start_index, + &new_end_index, &border); + + Label handle_rest; + Label* above = &handle_rest; + if (border == last + 1) { + // We didn't find any section that started after the limit, so everything + // above the border is one of the terminal labels. + above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; + DCHECK(new_end_index == end_index - 1); + } + + DCHECK_LE(start_index, new_end_index); + DCHECK_LE(new_start_index, end_index); + DCHECK_LT(start_index, new_start_index); + DCHECK_LT(new_end_index, end_index); + DCHECK(new_end_index + 1 == new_start_index || + (new_end_index + 2 == new_start_index && + border == ranges->at(new_end_index + 1))); + DCHECK_LT(min_char, border - 1); + DCHECK_LT(border, max_char); + DCHECK_LT(ranges->at(new_end_index), border); + DCHECK(border < ranges->at(new_start_index) || + (border == ranges->at(new_start_index) && + new_start_index == end_index && new_end_index == end_index - 1 && + border == last + 1)); + DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); + + masm->CheckCharacterGT(border - 1, above); + Label dummy; + GenerateBranches(masm, ranges, start_index, new_end_index, min_char, + border - 1, &dummy, even_label, odd_label); + if (handle_rest.is_linked()) { + masm->Bind(&handle_rest); + bool flip = (new_start_index & 1) != (start_index & 1); + GenerateBranches(masm, ranges, 
new_start_index, end_index, border, max_char, + &dummy, flip ? odd_label : even_label, + flip ? even_label : odd_label); + } +} + +void EmitClassRanges(RegExpMacroAssembler* macro_assembler, + RegExpClassRanges* cr, bool one_byte, Label* on_failure, + int cp_offset, bool check_offset, bool preloaded, + Zone* zone) { + ZoneList<CharacterRange>* ranges = cr->ranges(zone); + CharacterRange::Canonicalize(ranges); + + // Now that all processing (like case-insensitivity) is done, clamp the + // ranges to the set of ranges that may actually occur in the subject string. + if (one_byte) CharacterRange::ClampToOneByte(ranges); + + const int ranges_length = ranges->length(); + if (ranges_length == 0) { + if (!cr->is_negated()) { + macro_assembler->GoTo(on_failure); + } + if (check_offset) { + macro_assembler->CheckPosition(cp_offset, on_failure); + } + return; + } + + const base::uc32 max_char = MaxCodeUnit(one_byte); + if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) { + if (cr->is_negated()) { + macro_assembler->GoTo(on_failure); + } else { + // This is a common case hit by non-anchored expressions. + if (check_offset) { + macro_assembler->CheckPosition(cp_offset, on_failure); + } + } + return; + } + + if (!preloaded) { + macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); + } + + if (cr->is_standard(zone) && macro_assembler->CheckSpecialClassRanges( + cr->standard_type(), on_failure)) { + return; + } + + static constexpr int kMaxRangesForInlineBranchGeneration = 16; + if (ranges_length > kMaxRangesForInlineBranchGeneration) { + // For large range sets, emit a more compact instruction sequence to avoid + // a potentially problematic increase in code size. + // Note the flipped logic below (we check InRange if negated, NotInRange if + // not negated); this is necessary since the method falls through on + // failure whereas we want to fall through on success. 
+ if (cr->is_negated()) { + if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) { + return; + } + } else { + if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) { + return; + } + } + } + + // Generate a flat list of range boundaries for consumption by + // GenerateBranches. See the comment on that function for how the list should + // be structured + ZoneList<base::uc32>* range_boundaries = + zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone); + + bool zeroth_entry_is_failure = !cr->is_negated(); + + for (int i = 0; i < ranges_length; i++) { + CharacterRange& range = ranges->at(i); + if (range.from() == 0) { + DCHECK_EQ(i, 0); + zeroth_entry_is_failure = !zeroth_entry_is_failure; + } else { + range_boundaries->Add(range.from(), zone); + } + // `+ 1` to convert from inclusive to exclusive `to`. + // [from, to] == [from, to+1[. + range_boundaries->Add(range.to() + 1, zone); + } + int end_index = range_boundaries->length() - 1; + if (range_boundaries->at(end_index) > max_char) { + end_index--; + } + + Label fall_through; + GenerateBranches(macro_assembler, range_boundaries, + 0, // start_index. + end_index, + 0, // min_char. + max_char, &fall_through, + zeroth_entry_is_failure ? &fall_through : on_failure, + zeroth_entry_is_failure ? on_failure : &fall_through); + macro_assembler->Bind(&fall_through); +} + +} // namespace + +RegExpNode::~RegExpNode() = default; + +RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, + Trace* trace) { + // If we are generating a greedy loop then don't stop and don't reuse code. + if (trace->stop_node() != nullptr) { + return CONTINUE; + } + + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + if (trace->is_trivial()) { + if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { + // If a generic version is already scheduled to be generated or we have + // recursed too deeply then just generate a jump to that code. 
+ macro_assembler->GoTo(&label_); + // This will queue it up for generation of a generic version if it hasn't + // already been queued. + compiler->AddWork(this); + return DONE; + } + // Generate generic version of the node and bind the label for later use. + macro_assembler->Bind(&label_); + return CONTINUE; + } + + // We are being asked to make a non-generic version. Keep track of how many + // non-generic versions we generate so as not to overdo it. + trace_count_++; + if (KeepRecursing(compiler) && compiler->optimize() && + trace_count_ < kMaxCopiesCodeGenerated) { + return CONTINUE; + } + + // If we get here code has been generated for this node too many times or + // recursion is too deep. Time to switch to a generic version. The code for + // generic versions above can handle deep recursion properly. + bool was_limiting = compiler->limiting_recursion(); + compiler->set_limiting_recursion(true); + trace->Flush(compiler, this); + compiler->set_limiting_recursion(was_limiting); + return DONE; +} + +bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { + return !compiler->limiting_recursion() && + compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; +} + +void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) { + if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) { + // Anything may follow a positive submatch success, thus we need to accept + // all characters from this position onwards. 
+ bm->SetRest(offset); + } else { + on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); + } + SaveBMInfo(bm, not_at_start, offset); +} + +void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int filled_in, + bool not_at_start) { + if (action_type_ == SET_REGISTER_FOR_LOOP) { + on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler, + filled_in, not_at_start); + } else { + on_success()->GetQuickCheckDetails(details, compiler, filled_in, + not_at_start); + } +} + +void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) { + // Match the behaviour of EatsAtLeast on this node. + if (assertion_type() == AT_START && not_at_start) return; + on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); + SaveBMInfo(bm, not_at_start, offset); +} + +void NegativeLookaroundChoiceNode::GetQuickCheckDetails( + QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, + bool not_at_start) { + RegExpNode* node = continue_node(); + return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); +} + +namespace { + +// Takes the left-most 1-bit and smears it out, setting all bits to its right. +inline uint32_t SmearBitsRight(uint32_t v) { + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v; +} + +} // namespace + +bool QuickCheckDetails::Rationalize(bool asc) { + bool found_useful_op = false; + const uint32_t char_mask = CharMask(asc); + mask_ = 0; + value_ = 0; + int char_shift = 0; + for (int i = 0; i < characters_; i++) { + Position* pos = &positions_[i]; + if ((pos->mask & String::kMaxOneByteCharCode) != 0) { + found_useful_op = true; + } + mask_ |= (pos->mask & char_mask) << char_shift; + value_ |= (pos->value & char_mask) << char_shift; + char_shift += asc ? 
8 : 16; + } + return found_useful_op; +} + +int RegExpNode::EatsAtLeast(bool not_at_start) { + return not_at_start ? eats_at_least_.eats_at_least_from_not_start + : eats_at_least_.eats_at_least_from_possibly_start; +} + +EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() { + // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it + // implies that the following node must be a LoopChoiceNode. If we need to + // set registers to constant values for other reasons, we could introduce a + // new action type SET_REGISTER that doesn't imply anything about its + // successor. + UNREACHABLE(); +} + +void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, + RegExpCompiler* compiler, + int characters_filled_in, + bool not_at_start) { + // See comment in RegExpNode::EatsAtLeastFromLoopEntry. + UNREACHABLE(); +} + +EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() { + DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. + + if (read_backward()) { + // The eats_at_least value is not used if reading backward. The + // EatsAtLeastPropagator should've zeroed it as well. + DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0); + DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0); + return {}; + } + + // Figure out how much the loop body itself eats, not including anything in + // the continuation case. In general, the nodes in the loop body should report + // that they eat at least the number eaten by the continuation node, since any + // successful match in the loop body must also include the continuation node. + // However, in some cases involving positive lookaround, the loop body under- + // reports its appetite, so use saturated math here to avoid negative numbers. 
+ uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>( + loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true)); + uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>( + loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true)); + + // Limit the number of loop iterations to avoid overflow in subsequent steps. + int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations()); + + EatsAtLeastInfo result; + result.eats_at_least_from_not_start = + base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start + + continue_node_->EatsAtLeast(true)); + if (loop_iterations > 0 && loop_body_from_possibly_start > 0) { + // First loop iteration eats at least one, so all subsequent iterations + // and the after-loop chunk are guaranteed to not be at the start. + result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>( + loop_body_from_possibly_start + + (loop_iterations - 1) * loop_body_from_not_start + + continue_node_->EatsAtLeast(true)); + } else { + // Loop body might eat nothing, so only continue node contributes. 
+ result.eats_at_least_from_possibly_start = + continue_node_->EatsAtLeast(false); + } + return result; +} + +bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, + Trace* bounds_check_trace, Trace* trace, + bool preload_has_checked_bounds, + Label* on_possible_success, + QuickCheckDetails* details, + bool fall_through_on_failure, + ChoiceNode* predecessor) { + DCHECK_NOT_NULL(predecessor); + if (details->characters() == 0) return false; + GetQuickCheckDetails(details, compiler, 0, + trace->at_start() == Trace::FALSE_VALUE); + if (details->cannot_match()) return false; + if (!details->Rationalize(compiler->one_byte())) return false; + DCHECK(details->characters() == 1 || + compiler->macro_assembler()->CanReadUnaligned()); + uint32_t mask = details->mask(); + uint32_t value = details->value(); + + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + + if (trace->characters_preloaded() != details->characters()) { + DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); + // The bounds check is performed using the minimum number of characters + // any choice would eat, so if the bounds check fails, then none of the + // choices can succeed, so we can just immediately backtrack, rather + // than go to the next choice. The number of characters preloaded may be + // less than the number used for the bounds check. + int eats_at_least = predecessor->EatsAtLeast( + bounds_check_trace->at_start() == Trace::FALSE_VALUE); + DCHECK_GE(eats_at_least, details->characters()); + assembler->LoadCurrentCharacter( + trace->cp_offset(), bounds_check_trace->backtrack(), + !preload_has_checked_bounds, details->characters(), eats_at_least); + } + + bool need_mask = true; + + if (details->characters() == 1) { + // If number of characters preloaded is 1 then we used a byte or 16 bit + // load so the value is already masked down. 
+ const uint32_t char_mask = CharMask(compiler->one_byte()); + if ((mask & char_mask) == char_mask) need_mask = false; + mask &= char_mask; + } else { + // For 2-character preloads in one-byte mode or 1-character preloads in + // two-byte mode we also use a 16 bit load with zero extend. + static const uint32_t kTwoByteMask = 0xFFFF; + static const uint32_t kFourByteMask = 0xFFFFFFFF; + if (details->characters() == 2 && compiler->one_byte()) { + if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; + } else if (details->characters() == 1 && !compiler->one_byte()) { + if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; + } else { + if (mask == kFourByteMask) need_mask = false; + } + } + + if (fall_through_on_failure) { + if (need_mask) { + assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); + } else { + assembler->CheckCharacter(value, on_possible_success); + } + } else { + if (need_mask) { + assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); + } else { + assembler->CheckNotCharacter(value, trace->backtrack()); + } + } + return true; +} + +// Here is the meat of GetQuickCheckDetails (see also the comment on the +// super-class in the .h file). +// +// We iterate along the text object, building up for each character a +// mask and value that can be used to test for a quick failure to match. +// The masks and values for the positions will be combined into a single +// machine word for the current character width in order to be used in +// generating a quick check. +void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, + int characters_filled_in, + bool not_at_start) { + // Do not collect any quick check details if the text node reads backward, + // since it reads in the opposite direction than we use for quick checks. 
+  if (read_backward()) return;
+  Isolate* isolate = compiler->macro_assembler()->isolate();
+  DCHECK(characters_filled_in < details->characters());
+  int characters = details->characters();
+  const uint32_t char_mask = CharMask(compiler->one_byte());
+  for (int k = 0; k < elements()->length(); k++) {
+    TextElement elm = elements()->at(k);
+    if (elm.text_type() == TextElement::ATOM) {
+      base::Vector<const base::uc16> quarks = elm.atom()->data();
+      for (int i = 0; i < characters && i < quarks.length(); i++) {
+        QuickCheckDetails::Position* pos =
+            details->positions(characters_filled_in);
+        base::uc16 c = quarks[i];
+        if (IsIgnoreCase(compiler->flags())) {
+          unibrow::uchar chars[4];
+          int length = GetCaseIndependentLetters(
+              isolate, c, compiler->one_byte(), chars, 4);
+          if (length == 0) {
+            // This can happen because all case variants are non-Latin1, but we
+            // know the input is Latin1.
+            details->set_cannot_match();
+            pos->determines_perfectly = false;
+            return;
+          }
+          if (length == 1) {
+            // This letter has no case equivalents, so it's nice and simple
+            // and the mask-compare will determine definitely whether we have
+            // a match at this character position.
+            pos->mask = char_mask;
+            pos->value = chars[0];
+            pos->determines_perfectly = true;
+          } else {
+            uint32_t common_bits = char_mask;
+            uint32_t bits = chars[0];
+            for (int j = 1; j < length; j++) {
+              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
+              common_bits ^= differing_bits;
+              bits &= common_bits;
+            }
+            // If length is 2 and common bits has only one zero in it then
+            // our mask and compare instruction will determine definitely
+            // whether we have a match at this character position. Otherwise
+            // it can only be an approximate check.
+            uint32_t one_zero = (common_bits | ~char_mask);
+            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
+              pos->determines_perfectly = true;
+            }
+            pos->mask = common_bits;
+            pos->value = bits;
+          }
+        } else {
+          // Don't ignore case. Nice simple case where the mask-compare will
+          // determine definitely whether we have a match at this character
+          // position.
+          if (c > char_mask) {
+            details->set_cannot_match();
+            pos->determines_perfectly = false;
+            return;
+          }
+          pos->mask = char_mask;
+          pos->value = c;
+          pos->determines_perfectly = true;
+        }
+        characters_filled_in++;
+        DCHECK(characters_filled_in <= details->characters());
+        if (characters_filled_in == details->characters()) {
+          return;
+        }
+      }
+    } else {
+      QuickCheckDetails::Position* pos =
+          details->positions(characters_filled_in);
+      RegExpClassRanges* tree = elm.class_ranges();
+      ZoneList<CharacterRange>* ranges = tree->ranges(zone());
+      if (tree->is_negated() || ranges->is_empty()) {
+        // A quick check uses multi-character mask and compare. There is no
+        // useful way to incorporate a negative char class into this scheme
+        // so we just conservatively create a mask and value that will always
+        // succeed.
+        // Likewise for empty ranges (empty ranges can occur e.g. when
+        // compiling for one-byte subjects and impossible (non-one-byte) ranges
+        // have been removed).
+        pos->mask = 0;
+        pos->value = 0;
+      } else {
+        int first_range = 0;
+        while (ranges->at(first_range).from() > char_mask) {
+          first_range++;
+          if (first_range == ranges->length()) {
+            details->set_cannot_match();
+            pos->determines_perfectly = false;
+            return;
+          }
+        }
+        CharacterRange range = ranges->at(first_range);
+        const base::uc32 first_from = range.from();
+        const base::uc32 first_to =
+            (range.to() > char_mask) ? char_mask : range.to();
+        const uint32_t differing_bits = (first_from ^ first_to);
+        // A mask and compare is only perfect if the differing bits form a
+        // number like 00011111 with one single block of trailing 1s.
+        if ((differing_bits & (differing_bits + 1)) == 0 &&
+            first_from + differing_bits == first_to) {
+          pos->determines_perfectly = true;
+        }
+        uint32_t common_bits = ~SmearBitsRight(differing_bits);
+        uint32_t bits = (first_from & common_bits);
+        for (int i = first_range + 1; i < ranges->length(); i++) {
+          range = ranges->at(i);
+          const base::uc32 from = range.from();
+          if (from > char_mask) continue;
+          const base::uc32 to =
+              (range.to() > char_mask) ? char_mask : range.to();
+          // Here we are combining more ranges into the mask and compare
+          // value. With each new range the mask becomes more sparse and
+          // so the chances of a false positive rise. A character class
+          // with multiple ranges is assumed never to be equivalent to a
+          // mask and compare operation.
+          pos->determines_perfectly = false;
+          uint32_t new_common_bits = (from ^ to);
+          new_common_bits = ~SmearBitsRight(new_common_bits);
+          common_bits &= new_common_bits;
+          bits &= new_common_bits;
+          uint32_t new_differing_bits = (from & common_bits) ^ bits;
+          common_bits ^= new_differing_bits;
+          bits &= common_bits;
+        }
+        pos->mask = common_bits;
+        pos->value = bits;
+      }
+      characters_filled_in++;
+      DCHECK(characters_filled_in <= details->characters());
+      if (characters_filled_in == details->characters()) return;
+    }
+  }
+  DCHECK(characters_filled_in != details->characters());
+  if (!details->cannot_match()) {
+    on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
+                                       true);
+  }
+}
+
+// Resets the first characters_ entries of positions_ (mask and value to 0,
+// determines_perfectly to false) and then sets characters_ to 0.
+void QuickCheckDetails::Clear() {
+  for (int i = 0; i < characters_; i++) {
+    positions_[i].mask = 0;
+    positions_[i].value = 0;
+    positions_[i].determines_perfectly = false;
+  }
+  characters_ = 0;
+}
+
+// Discards the first |by| positions: the remaining entries are shifted down to
+// index 0, the vacated tail entries are zeroed, and characters_ shrinks by
+// |by|. If |by| is negative or not smaller than characters_, everything is
+// cleared instead. |one_byte| is not read in this function.
+void QuickCheckDetails::Advance(int by, bool one_byte) {
+  if (by >= characters_ || by < 0) {
+    DCHECK_IMPLIES(by < 0, characters_ == 0);
+    Clear();
+    return;
+  }
+  DCHECK_LE(characters_ - by, 4);
+  DCHECK_LE(characters_, 4);
+  for (int i = 0; i < characters_ - by; i++) {
+    positions_[i] = positions_[by + i];
+  }
+  for (int i = characters_ - by; i < characters_; i++) {
+    positions_[i].mask = 0;
+    positions_[i].value = 0;
+    positions_[i].determines_perfectly = false;
+  }
+  characters_ -= by;
+  // We could change mask_ and value_ here but we would never advance unless
+  // they had already been used in a check and they won't be used again because
+  // it would gain us nothing. So there's no point.
+}
+
+// Folds |other| into this object for positions from_index..characters_ - 1.
+// If |other| cannot match, nothing changes; if this object cannot match, it
+// becomes a copy of |other|. Otherwise each position keeps only the mask bits
+// on which both sides agree, and determines_perfectly is set to false unless
+// both sides have the same mask and value and |other| is itself perfect.
+// Side effect: the position values of |other| are masked in place.
+void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
+  DCHECK(characters_ == other->characters_);
+  if (other->cannot_match_) {
+    return;
+  }
+  if (cannot_match_) {
+    *this = *other;
+    return;
+  }
+  for (int i = from_index; i < characters_; i++) {
+    QuickCheckDetails::Position* pos = positions(i);
+    QuickCheckDetails::Position* other_pos = other->positions(i);
+    if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
+        !other_pos->determines_perfectly) {
+      // Our mask-compare operation will be approximate unless we have the
+      // exact same operation on both sides of the alternation.
+      pos->determines_perfectly = false;
+    }
+    pos->mask &= other_pos->mask;
+    pos->value &= pos->mask;
+    other_pos->value &= pos->mask;
+    uint32_t differing_bits = (pos->value ^ other_pos->value);
+    pos->mask &= ~differing_bits;
+    pos->value &= pos->mask;
+  }
+}
+
+// Scope guard that sets info->visited on construction (asserting that it was
+// not already set) and clears it again on destruction.
+class VisitMarker {
+ public:
+  explicit VisitMarker(NodeInfo* info) : info_(info) {
+    DCHECK(!info->visited);
+    info->visited = true;
+  }
+  ~VisitMarker() { info_->visited = false; }
+
+ private:
+  NodeInfo* info_;
+};
+
+// Temporarily sets traversed_loop_initialization_node_.
+// The flag must be unset on construction and is cleared again on destruction.
+class LoopInitializationMarker {
+ public:
+  explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
+    DCHECK(!node_->traversed_loop_initialization_node_);
+    node_->traversed_loop_initialization_node_ = true;
+  }
+  ~LoopInitializationMarker() {
+    DCHECK(node_->traversed_loop_initialization_node_);
+    node_->traversed_loop_initialization_node_ = false;
+  }
+  LoopInitializationMarker(const LoopInitializationMarker&) = delete;
+  LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
+
+ private:
+  LoopChoiceNode* node_;
+};
+
+// Temporarily decrements min_loop_iterations_.
+// The counter must be positive on construction and is incremented back on
+// destruction.
+class IterationDecrementer {
+ public:
+  explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
+    DCHECK_GT(node_->min_loop_iterations_, 0);
+    --node_->min_loop_iterations_;
+  }
+  ~IterationDecrementer() { ++node_->min_loop_iterations_; }
+  IterationDecrementer(const IterationDecrementer&) = delete;
+  IterationDecrementer& operator=(const IterationDecrementer&) = delete;
+
+ private:
+  LoopChoiceNode* node_;
+};
+
+// Returns the cached replacement when one has already been calculated, and
+// returns this node unchanged once |depth| has dropped below zero. Otherwise
+// marks the node as visited for the duration of the call and filters the
+// successor.
+RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  DCHECK(!info()->visited);
+  VisitMarker marker(info());
+  return FilterSuccessor(depth - 1, flags);
+}
+
+// Runs FilterOneByte on on_success_. A nullptr result is recorded and returned
+// as this node's replacement; otherwise on_success_ is updated to the filtered
+// successor and this node is recorded as its own replacement.
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
+  RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags);
+  if (next == nullptr) return set_replacement(nullptr);
+  on_success_ = next;
+  return set_replacement(this);
+}
+
+// We need to check for the following characters: 0x39C 0x3BC 0x178.
+bool RangeContainsLatin1Equivalents(CharacterRange range) {
+  // TODO(dcarney): this could be a lot more efficient.
+  return range.Contains(0x039C) || range.Contains(0x03BC) ||
+         range.Contains(0x0178);
+}
+
+namespace {
+
+// Returns true if any range in |ranges| satisfies
+// RangeContainsLatin1Equivalents.
+bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
+  for (int i = 0; i < ranges->length(); i++) {
+    // TODO(dcarney): this could be a lot more efficient.
+    if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
+  }
+  return false;
+}
+
+} // namespace
+
+// Same caching and depth handling as SeqRegExpNode::FilterOneByte. For atoms,
+// each character is (when ignoring case) passed through TryConvertToLatin1
+// and written back into the atom's data; a character above Latin1::kMaxChar
+// makes this node's replacement nullptr. For class ranges, the ranges are
+// canonicalized and the replacement is nullptr when the first range shows
+// that no character up to String::kMaxOneByteCharCode can match, unless case
+// is ignored and the ranges contain Latin1 equivalents.
+RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  DCHECK(!info()->visited);
+  VisitMarker marker(info());
+  int element_count = elements()->length();
+  for (int i = 0; i < element_count; i++) {
+    TextElement elm = elements()->at(i);
+    if (elm.text_type() == TextElement::ATOM) {
+      base::Vector<const base::uc16> quarks = elm.atom()->data();
+      for (int j = 0; j < quarks.length(); j++) {
+        base::uc16 c = quarks[j];
+        if (IsIgnoreCase(flags)) {
+          c = unibrow::Latin1::TryConvertToLatin1(c);
+        }
+        if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
+        // Replace quark in case we converted to Latin-1.
+        base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
+        writable_quarks[j] = c;
+      }
+    } else {
+      DCHECK(elm.text_type() == TextElement::CLASS_RANGES);
+      RegExpClassRanges* cr = elm.class_ranges();
+      ZoneList<CharacterRange>* ranges = cr->ranges(zone());
+      CharacterRange::Canonicalize(ranges);
+      // Now they are in order so we only need to look at the first.
+      int range_count = ranges->length();
+      if (cr->is_negated()) {
+        if (range_count != 0 && ranges->at(0).from() == 0 &&
+            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
+          // This will be handled in a later filter.
+          if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
+            continue;
+          }
+          return set_replacement(nullptr);
+        }
+      } else {
+        if (range_count == 0 ||
+            ranges->at(0).from() > String::kMaxOneByteCharCode) {
+          // This will be handled in a later filter.
+          if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
+            continue;
+          }
+          return set_replacement(nullptr);
+        }
+      }
+    }
+  }
+  return FilterSuccessor(depth - 1, flags);
+}
+
+// Filters the continuation first: if continue_node_ is filtered away, this
+// node's replacement is nullptr. Otherwise defers to
+// ChoiceNode::FilterOneByte.
+RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  {
+    VisitMarker marker(info());
+
+    RegExpNode* continue_replacement =
+        continue_node_->FilterOneByte(depth - 1, flags);
+    // If we can't continue after the loop then there is no sense in doing the
+    // loop.
+    if (continue_replacement == nullptr) return set_replacement(nullptr);
+  }
+
+  return ChoiceNode::FilterOneByte(depth - 1, flags);
+}
+
+// If any alternative carries guards, the node is kept as is. Otherwise every
+// alternative is filtered: with no survivors the replacement is nullptr, with
+// exactly one it is that survivor, and with several the node keeps itself,
+// rebuilding alternatives_ when some alternatives were dropped.
+RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  int choice_count = alternatives_->length();
+
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative alternative = alternatives_->at(i);
+    if (alternative.guards() != nullptr &&
+        alternative.guards()->length() != 0) {
+      set_replacement(this);
+      return this;
+    }
+  }
+
+  int surviving = 0;
+  RegExpNode* survivor = nullptr;
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative alternative = alternatives_->at(i);
+    RegExpNode* replacement =
+        alternative.node()->FilterOneByte(depth - 1, flags);
+    DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
+    if (replacement != nullptr) {
+      alternatives_->at(i).set_node(replacement);
+      surviving++;
+      survivor = replacement;
+    }
+  }
+  if (surviving < 2) return set_replacement(survivor);
+
+  set_replacement(this);
+  if (surviving == choice_count) {
+    return this;
+  }
+  // Only some of the nodes survived the filtering. We need to rebuild the
+  // alternatives list.
+  ZoneList<GuardedAlternative>* new_alternatives =
+      zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
+  for (int i = 0; i < choice_count; i++) {
+    RegExpNode* replacement =
+        alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
+    if (replacement != nullptr) {
+      alternatives_->at(i).set_node(replacement);
+      new_alternatives->Add(alternatives_->at(i), zone());
+    }
+  }
+  alternatives_ = new_alternatives;
+  return this;
+}
+
+// Filters the continuation and the lookaround separately. A filtered-away
+// continuation yields nullptr; a filtered-away lookaround yields the filtered
+// continuation alone; otherwise both alternatives are updated in place and
+// this node is kept.
+RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
+                                                        RegExpFlags flags) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  // Alternative 0 is the negative lookahead, alternative 1 is what comes
+  // afterwards.
+  RegExpNode* node = continue_node();
+  RegExpNode* replacement = node->FilterOneByte(depth - 1, flags);
+  if (replacement == nullptr) return set_replacement(nullptr);
+  alternatives_->at(kContinueIndex).set_node(replacement);
+
+  RegExpNode* neg_node = lookaround_node();
+  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags);
+  // If the negative lookahead is always going to fail then
+  // we don't need to check it.
+  if (neg_replacement == nullptr) return set_replacement(replacement);
+  alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
+  return set_replacement(this);
+}
+
+// Does nothing if the loop body can be zero length or this node is already
+// being visited. Otherwise the details come either from the loop body alone
+// or from ChoiceNode::GetQuickCheckDetails, as explained in the two branches.
+void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                          RegExpCompiler* compiler,
+                                          int characters_filled_in,
+                                          bool not_at_start) {
+  if (body_can_be_zero_length_ || info()->visited) return;
+  not_at_start = not_at_start || this->not_at_start();
+  DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
+  if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
+      loop_node_->EatsAtLeast(not_at_start) >
+          continue_node_->EatsAtLeast(true)) {
+    // Loop body is guaranteed to execute at least once, and consume characters
+    // when it does, meaning the only possible quick checks from this point
+    // begin with the loop body. We may recursively visit this LoopChoiceNode,
+    // but we temporarily decrease its minimum iteration counter so we know when
+    // to check the continue case.
+    IterationDecrementer next_iteration(this);
+    loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
+                                     not_at_start);
+  } else {
+    // Might not consume anything in the loop body, so treat it like a normal
+    // ChoiceNode (and don't recursively visit this node again).
+    VisitMarker marker(info());
+    ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
+                                     not_at_start);
+  }
+}
+
+// Calls GetQuickCheckDetails, first setting
+// traversed_loop_initialization_node_ (via LoopInitializationMarker) for the
+// duration of the call if it is not already set.
+void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
+    QuickCheckDetails* details, RegExpCompiler* compiler,
+    int characters_filled_in, bool not_at_start) {
+  if (traversed_loop_initialization_node_) {
+    // We already entered this loop once, exited via its continuation node, and
+    // followed an outer loop's back-edge to before the loop entry point. We
+    // could try to reset the minimum iteration count to its starting value at
+    // this point, but that seems like more trouble than it's worth. It's safe
+    // to keep going with the current (possibly reduced) minimum iteration
+    // count.
+    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
+  } else {
+    // We are entering a loop via its counter initialization action, meaning we
+    // are guaranteed to run the loop body at least some minimum number of times
+    // before running the continuation node. Set a flag so that this node knows
+    // (now and any times we visit it again recursively) that it was entered
+    // from the top.
+    LoopInitializationMarker marker(this);
+    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
+  }
+}
+
+// When the body can be zero length or the budget is used up, calls
+// bm->SetRest(offset); otherwise delegates to ChoiceNode::FillInBMInfo with
+// the budget reduced by one. In both cases the result is stored with
+// SaveBMInfo.
+void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+                                  BoyerMooreLookahead* bm, bool not_at_start) {
+  if (body_can_be_zero_length_ || budget <= 0) {
+    bm->SetRest(offset);
+    SaveBMInfo(bm, not_at_start, offset);
+    return;
+  }
+  ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+  SaveBMInfo(bm, not_at_start, offset);
+}
+
+// Collects the details of the first alternative directly into |details|, then
+// computes each further alternative into a fresh QuickCheckDetails and merges
+// it in from position characters_filled_in onwards.
+void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                      RegExpCompiler* compiler,
+                                      int characters_filled_in,
+                                      bool not_at_start) {
+  not_at_start = (not_at_start || not_at_start_);
+  int choice_count = alternatives_->length();
+  DCHECK_LT(0, choice_count);
+  alternatives_->at(0).node()->GetQuickCheckDetails(
+      details, compiler, characters_filled_in, not_at_start);
+  for (int i = 1; i < choice_count; i++) {
+    QuickCheckDetails new_details(details->characters());
+    RegExpNode* node = alternatives_->at(i).node();
+    node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
+                               not_at_start);
+    // Here we merge the quick match details of the two branches.
+    details->Merge(&new_details, characters_filled_in);
+  }
+}
+
+namespace {
+
+// Check for [0-9A-Z_a-z].
+// Branches to |word| or |non_word|; with fall_through_on_word a word
+// character falls through instead of branching, otherwise a non-word
+// character falls through. CheckSpecialClassRanges is tried first and the
+// explicit comparisons are only emitted when it returns false.
+void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
+                   Label* non_word, bool fall_through_on_word) {
+  if (assembler->CheckSpecialClassRanges(
+          fall_through_on_word ? StandardCharacterSet::kWord
+                               : StandardCharacterSet::kNotWord,
+          fall_through_on_word ? non_word : word)) {
+    // Optimized implementation available.
+    return;
+  }
+  assembler->CheckCharacterGT('z', non_word);
+  assembler->CheckCharacterLT('0', non_word);
+  assembler->CheckCharacterGT('a' - 1, word);
+  assembler->CheckCharacterLT('9' + 1, word);
+  assembler->CheckCharacterLT('A', non_word);
+  assembler->CheckCharacterLT('Z' + 1, word);
+  if (fall_through_on_word) {
+    assembler->CheckNotCharacter('_', non_word);
+  } else {
+    assembler->CheckCharacter('_', word);
+  }
+}
+
+// Emit the code to check for a ^ in multiline mode (1-character lookbehind
+// that matches newline or the start of input).
+void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+
+  // We will load the previous character into the current character register.
+  Trace new_trace(*trace);
+  new_trace.InvalidateCurrentCharacter();
+
+  // A positive (> 0) cp_offset means we've already successfully matched a
+  // non-empty-width part of the pattern, and thus cannot be at or before the
+  // start of the subject string. We can thus skip both at-start and
+  // bounds-checks when loading the one-character lookbehind.
+  const bool may_be_at_or_before_subject_string_start =
+      new_trace.cp_offset() <= 0;
+
+  Label ok;
+  if (may_be_at_or_before_subject_string_start) {
+    // The start of input counts as a newline in this context, so skip to ok if
+    // we are at the start.
+    assembler->CheckAtStart(new_trace.cp_offset(), &ok);
+  }
+
+  // If we've already checked that we are not at the start of input, it's okay
+  // to load the previous character without bounds checks.
+  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
+  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
+                                  new_trace.backtrack(), can_skip_bounds_check);
+  if (!assembler->CheckSpecialClassRanges(StandardCharacterSet::kLineTerminator,
+                                          new_trace.backtrack())) {
+    // Newline means \n, \r, 0x2028 or 0x2029.
+    if (!compiler->one_byte()) {
+      assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
+    }
+    assembler->CheckCharacter('\n', &ok);
+    assembler->CheckNotCharacter('\r', new_trace.backtrack());
+  }
+  assembler->Bind(&ok);
+  on_success->Emit(compiler, &new_trace);
+}
+
+} // namespace
+
+// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
+void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  Isolate* isolate = assembler->isolate();
+  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
+  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
+  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
+  if (lookahead == nullptr) {
+    int eats_at_least =
+        std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
+    if (eats_at_least >= 1) {
+      BoyerMooreLookahead* bm =
+          zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
+      FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
+      if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
+      if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
+    }
+  } else {
+    if (lookahead->at(0)->is_non_word())
+      next_is_word_character = Trace::FALSE_VALUE;
+    if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
+  }
+  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
+  if (next_is_word_character == Trace::UNKNOWN) {
+    Label before_non_word;
+    Label before_word;
+    if (trace->characters_preloaded() != 1) {
+      assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
+    }
+    // Fall through on non-word.
+    EmitWordCheck(assembler, &before_word, &before_non_word, false);
+    // Next character is not a word character.
+    assembler->Bind(&before_non_word);
+    Label ok;
+    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
+    assembler->GoTo(&ok);
+
+    assembler->Bind(&before_word);
+    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
+    assembler->Bind(&ok);
+  } else if (next_is_word_character == Trace::TRUE_VALUE) {
+    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
+  } else {
+    DCHECK(next_is_word_character == Trace::FALSE_VALUE);
+    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
+  }
+}
+
+// Emits a check of the character before the current position and backtracks
+// if it is a word character or a non-word character, as selected by
+// |backtrack_if_previous|; in the other case control falls through to the
+// code emitted for on_success().
+void AssertionNode::BacktrackIfPrevious(
+    RegExpCompiler* compiler, Trace* trace,
+    AssertionNode::IfPrevious backtrack_if_previous) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  Trace new_trace(*trace);
+  new_trace.InvalidateCurrentCharacter();
+
+  Label fall_through;
+  Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
+                                                        : &fall_through;
+  Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
+                                                    : new_trace.backtrack();
+
+  // A positive (> 0) cp_offset means we've already successfully matched a
+  // non-empty-width part of the pattern, and thus cannot be at or before the
+  // start of the subject string. We can thus skip both at-start and
+  // bounds-checks when loading the one-character lookbehind.
+  const bool may_be_at_or_before_subject_string_start =
+      new_trace.cp_offset() <= 0;
+
+  if (may_be_at_or_before_subject_string_start) {
+    // The start of input counts as a non-word character, so the question is
+    // decided if we are at the start.
+    assembler->CheckAtStart(new_trace.cp_offset(), non_word);
+  }
+
+  // If we've already checked that we are not at the start of input, it's okay
+  // to load the previous character without bounds checks.
+  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
+  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
+                                  can_skip_bounds_check);
+  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
+
+  assembler->Bind(&fall_through);
+  on_success()->Emit(compiler, &new_trace);
+}
+
+// An AT_START assertion reached with not_at_start set cannot match; every
+// other case forwards to the successor.
+void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
+                                         RegExpCompiler* compiler,
+                                         int filled_in, bool not_at_start) {
+  if (assertion_type_ == AT_START && not_at_start) {
+    details->set_cannot_match();
+    return;
+  }
+  return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
+                                            not_at_start);
+}
+
+// Emits code for the assertion selected by assertion_type_. Cases that do not
+// return early continue with on_success() using the unmodified |trace|.
+void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  switch (assertion_type_) {
+    case AT_END: {
+      Label ok;
+      assembler->CheckPosition(trace->cp_offset(), &ok);
+      assembler->GoTo(trace->backtrack());
+      assembler->Bind(&ok);
+      break;
+    }
+    case AT_START: {
+      if (trace->at_start() == Trace::FALSE_VALUE) {
+        assembler->GoTo(trace->backtrack());
+        return;
+      }
+      if (trace->at_start() == Trace::UNKNOWN) {
+        assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
+        Trace at_start_trace = *trace;
+        at_start_trace.set_at_start(Trace::TRUE_VALUE);
+        on_success()->Emit(compiler, &at_start_trace);
+        return;
+      }
+    } break;
+    case AFTER_NEWLINE:
+      EmitHat(compiler, on_success(), trace);
+      return;
+    case AT_BOUNDARY:
+    case AT_NON_BOUNDARY: {
+      EmitBoundaryCheck(compiler, trace);
+      return;
+    }
+  }
+  on_success()->Emit(compiler, trace);
+}
+
+namespace {
+
+// True if |quick_check| is non-null, covers |offset|, and marks that position
+// as determines_perfectly.
+bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
+  if (quick_check == nullptr) return false;
+  if (offset >= quick_check->characters()) return false;
+  return quick_check->positions(offset)->determines_perfectly;
+}
+
+// Raises *checked_up_to to |index| if |index| is larger.
+void UpdateBoundsCheck(int index, int* checked_up_to) {
+  if (index > *checked_up_to) {
+    *checked_up_to = index;
+  }
+}
+
+} // namespace
+
+// We call this repeatedly to generate code for each pass over the text node.
+// The passes are in increasing order of difficulty because we hope one
+// of the first passes will fail in which case we are saved the work of the
+// later passes. For example for the case independent regexp /%[asdfghjkl]a/
+// we will check the '%' in the first pass, the case independent 'a' in the
+// second pass and the character class in the last pass.
+//
+// The passes are done from right to left, so for example to test for /bar/
+// we will first test for an 'r' with offset 2, then an 'a' with offset 1
+// and then a 'b' with offset 0. This means we can avoid the end-of-input
+// bounds check most of the time. In the example we only need to check for
+// end-of-input when loading the putative 'r'.
+//
+// A slight complication involves the fact that the first character may already
+// be fetched into a register by the previous node. In this case we want to
+// do the test for that character first. We do this in separate passes. The
+// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
+// pass has been performed then subsequent passes will have true in
+// first_element_checked to indicate that that character does not need to be
+// checked again.
+//
+// In addition to all this we are passed a Trace, which can
+// contain an AlternativeGeneration object. In this AlternativeGeneration
+// object we can see details of any quick check that was already passed in
+// order to get to the code we are now generating. The quick check can involve
+// loading characters, which means we do not need to recheck the bounds
+// up to the limit the quick check already checked. In addition the quick
+// check can have involved a mask and compare operation which may simplify
+// or obviate the need for further checks at some character positions.
+void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
+                            bool preloaded, Trace* trace,
+                            bool first_element_checked, int* checked_up_to) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  Isolate* isolate = assembler->isolate();
+  bool one_byte = compiler->one_byte();
+  Label* backtrack = trace->backtrack();
+  QuickCheckDetails* quick_check = trace->quick_check_performed();
+  int element_count = elements()->length();
+  int backward_offset = read_backward() ? -Length() : 0;
+  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
+    TextElement elm = elements()->at(i);
+    int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
+    if (elm.text_type() == TextElement::ATOM) {
+      if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
+      base::Vector<const base::uc16> quarks = elm.atom()->data();
+      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
+        if (first_element_checked && i == 0 && j == 0) continue;
+        if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
+        base::uc16 quark = quarks[j];
+        if (IsIgnoreCase(compiler->flags())) {
+          // Everywhere else we assume that a non-Latin-1 character cannot match
+          // a Latin-1 character. Avoid the cases where this assumption is
+          // invalid by using the Latin1 equivalent instead.
+          quark = unibrow::Latin1::TryConvertToLatin1(quark);
+        }
+        bool needs_bounds_check =
+            *checked_up_to < cp_offset + j || read_backward();
+        bool bounds_checked = false;
+        switch (pass) {
+          case NON_LATIN1_MATCH:
+            DCHECK(one_byte);
+            if (quark > String::kMaxOneByteCharCode) {
+              assembler->GoTo(backtrack);
+              return;
+            }
+            break;
+          case NON_LETTER_CHARACTER_MATCH:
+            bounds_checked =
+                EmitAtomNonLetter(isolate, compiler, quark, backtrack,
+                                  cp_offset + j, needs_bounds_check, preloaded);
+            break;
+          case SIMPLE_CHARACTER_MATCH:
+            bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
+                                                 backtrack, cp_offset + j,
+                                                 needs_bounds_check, preloaded);
+            break;
+          case CASE_CHARACTER_MATCH:
+            bounds_checked =
+                EmitAtomLetter(isolate, compiler, quark, backtrack,
+                               cp_offset + j, needs_bounds_check, preloaded);
+            break;
+          default:
+            break;
+        }
+        if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
+      }
+    } else {
+      DCHECK_EQ(TextElement::CLASS_RANGES, elm.text_type());
+      if (pass == CHARACTER_CLASS_MATCH) {
+        if (first_element_checked && i == 0) continue;
+        if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
+        RegExpClassRanges* cr = elm.class_ranges();
+        bool bounds_check = *checked_up_to < cp_offset || read_backward();
+        EmitClassRanges(assembler, cr, one_byte, backtrack, cp_offset,
+                        bounds_check, preloaded, zone());
+        UpdateBoundsCheck(cp_offset, checked_up_to);
+      }
+    }
+  }
+}
+
+// Returns the cp_offset of the last element plus that element's length.
+int TextNode::Length() {
+  TextElement elm = elements()->last();
+  DCHECK_LE(0, elm.cp_offset());
+  return elm.cp_offset() + elm.length();
+}
+
+// Says whether atoms are skipped in |pass|: when ignoring case only
+// SIMPLE_CHARACTER_MATCH is skipped; otherwise NON_LETTER_CHARACTER_MATCH and
+// CASE_CHARACTER_MATCH are skipped.
+bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
+  if (ignore_case) {
+    return pass == SIMPLE_CHARACTER_MATCH;
+  } else {
+    return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
+  }
+}
+
+// Allocates (in |zone|) a TextNode over a new RegExpClassRanges built from
+// |ranges|, which must not be null.
+TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
+                                             ZoneList<CharacterRange>* ranges,
+                                             bool read_backward,
+                                             RegExpNode* on_success) {
+  DCHECK_NOT_NULL(ranges);
+  // TODO(jgruber): There's no fundamental need to create this
+  // RegExpClassRanges; we could refactor to avoid the allocation.
+  return zone->New<TextNode>(zone->New<RegExpClassRanges>(zone, ranges),
+                             read_backward, on_success);
+}
+
+// Builds a two-element TextNode: a class for the single |lead| range followed
+// by a class for |trail_ranges|.
+TextNode* TextNode::CreateForSurrogatePair(
+    Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
+    bool read_backward, RegExpNode* on_success) {
+  ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
+  ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
+  elms->Add(
+      TextElement::ClassRanges(zone->New<RegExpClassRanges>(zone, lead_ranges)),
+      zone);
+  elms->Add(TextElement::ClassRanges(
+                zone->New<RegExpClassRanges>(zone, trail_ranges)),
+            zone);
+  return zone->New<TextNode>(elms, read_backward, on_success);
+}
+
+// Builds a two-element TextNode: a class for |lead_ranges| followed by a
+// class for the single |trail| range.
+TextNode* TextNode::CreateForSurrogatePair(
+    Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
+    bool read_backward, RegExpNode* on_success) {
+  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
+  ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
+  elms->Add(
+      TextElement::ClassRanges(zone->New<RegExpClassRanges>(zone, lead_ranges)),
+      zone);
+  elms->Add(TextElement::ClassRanges(
+                zone->New<RegExpClassRanges>(zone, trail_ranges)),
+            zone);
+  return zone->New<TextNode>(elms, read_backward, on_success);
+}
+
+// This generates the code to match a text node. A text node can contain
+// straight character sequences (possibly to be matched in a case-independent
+// way) and character classes. For efficiency we do not do this in a single
+// pass from left to right. Instead we pass over the text node several times,
+// emitting code for some character positions every time. See the comment on
+// TextEmitPass for details.
+void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  LimitResult limit_result = LimitVersions(compiler, trace);
+  if (limit_result == DONE) return;
+  DCHECK(limit_result == CONTINUE);
+
+  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
+    compiler->SetRegExpTooBig();
+    return;
+  }
+
+  if (compiler->one_byte()) {
+    int dummy = 0;
+    TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
+  }
+
+  bool first_elt_done = false;
+  int bound_checked_to = trace->cp_offset() - 1;
+  bound_checked_to += trace->bound_checked_up_to();
+
+  // If a character is preloaded into the current character register then
+  // check that now.
+  if (trace->characters_preloaded() == 1) {
+    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+      TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
+                   false, &bound_checked_to);
+    }
+    first_elt_done = true;
+  }
+
+  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+    TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
+                 first_elt_done, &bound_checked_to);
+  }
+
+  Trace successor_trace(*trace);
+  // If we advance backward, we may end up at the start.
+  successor_trace.AdvanceCurrentPositionInTrace(
+      read_backward() ? -Length() : Length(), compiler);
+  successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
+                                               : Trace::FALSE_VALUE);
+  RecursionCheck rc(compiler);
+  on_success()->Emit(compiler, &successor_trace);
+}
+
+// Forgets any preloaded characters by resetting characters_preloaded_ to 0.
+void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
+
+// Moves cp_offset_ by |by|, dropping preloaded characters and adjusting the
+// quick-check and bounds-check bookkeeping accordingly. If the new offset
+// exceeds kMaxCPOffset, the regexp is flagged as too big and cp_offset_ is
+// reset to 0.
+void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
+  // We don't have an instruction for shifting the current character register
+  // down or for using a shifted value for anything so let's just forget that
+  // we preloaded any characters into it.
+  characters_preloaded_ = 0;
+  // Adjust the offsets of the quick check performed information. This
+  // information is used to find out what we already determined about the
+  // characters by means of mask and compare.
+  quick_check_performed_.Advance(by, compiler->one_byte());
+  cp_offset_ += by;
+  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
+    compiler->SetRegExpTooBig();
+    cp_offset_ = 0;
+  }
+  bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
+}
+
+// Returns without doing anything when case is not ignored, and (with
+// V8_INTL_SUPPORT) when NeedsUnicodeCaseEquivalents(flags) holds. Otherwise
+// adds case equivalents to the ranges of every character class element that
+// is not a standard class.
+void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
+                                   RegExpFlags flags) {
+  if (!IsIgnoreCase(flags)) return;
+#ifdef V8_INTL_SUPPORT
+  if (NeedsUnicodeCaseEquivalents(flags)) return;
+#endif
+
+  int element_count = elements()->length();
+  for (int i = 0; i < element_count; i++) {
+    TextElement elm = elements()->at(i);
+    if (elm.text_type() == TextElement::CLASS_RANGES) {
+      RegExpClassRanges* cr = elm.class_ranges();
+      // None of the standard character classes is different in the case
+      // independent case and it slows us down if we don't know that.
+      if (cr->is_standard(zone())) continue;
+      ZoneList<CharacterRange>* ranges = cr->ranges(zone());
+      CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
+    }
+  }
+}
+
+// The greedy-loop text length of a text node is its Length().
+int TextNode::GreedyLoopTextLength() { return Length(); }
+
+// Returns on_success() if this forward-reading node consists of exactly one
+// character class that is either a negated class with no ranges or a single
+// range for which IsEverything(max_char) holds; returns nullptr otherwise.
+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
+    RegExpCompiler* compiler) {
+  if (read_backward()) return nullptr;
+  if (elements()->length() != 1) return nullptr;
+  TextElement elm = elements()->at(0);
+  if (elm.text_type() != TextElement::CLASS_RANGES) return nullptr;
+  RegExpClassRanges* node = elm.class_ranges();
+  ZoneList<CharacterRange>* ranges = node->ranges(zone());
+  CharacterRange::Canonicalize(ranges);
+  if (node->is_negated()) {
+    return ranges->length() == 0 ? on_success() : nullptr;
+  }
+  if (ranges->length() != 1) return nullptr;
+  const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
+  return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
+}
+
+// Finds the fixed match length of a sequence of nodes that goes from
+// this alternative and back to this choice node. If there are variable
+// length nodes or other complications in the way then return a sentinel
+// value indicating that a greedy loop cannot be constructed.
+int ChoiceNode::GreedyLoopTextLengthForAlternative(
+    GuardedAlternative* alternative) {
+  int length = 0;
+  RegExpNode* node = alternative->node();
+  // Later we will generate code for all these text nodes using recursion
+  // so we have to limit the max number.
+  int recursion_depth = 0;
+  while (node != this) {
+    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
+      return kNodeIsTooComplexForGreedyLoops;
+    }
+    int node_length = node->GreedyLoopTextLength();
+    if (node_length == kNodeIsTooComplexForGreedyLoops) {
+      return kNodeIsTooComplexForGreedyLoops;
+    }
+    length += node_length;
+    SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
+    node = seq_node->on_success();
+  }
+  if (read_backward()) {
+    length = -length;
+  }
+  // Check that we can jump by the whole text length. If not, return sentinel
+  // to indicate that we can't construct a greedy loop.
+  if (length < RegExpMacroAssembler::kMinCPOffset ||
+      length > RegExpMacroAssembler::kMaxCPOffset) {
+    return kNodeIsTooComplexForGreedyLoops;
+  }
+  return length;
+}
+
+// Adds |alt| as an alternative and remembers its node as loop_node_, which
+// must not have been set before.
+void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
+  DCHECK_NULL(loop_node_);
+  AddAlternative(alt);
+  loop_node_ = alt.node();
+}
+
+// Adds |alt| as an alternative and remembers its node as continue_node_,
+// which must not have been set before.
+void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
+  DCHECK_NULL(continue_node_);
+  AddAlternative(alt);
+  continue_node_ = alt.node();
+}
+
+// When |trace| names this node as its stop node, emits the back edge: advance
+// the current position by the loop's text length and jump to the loop label.
+// Otherwise a non-trivial trace is flushed, and a trivial one is handed to
+// ChoiceNode::Emit.
+void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+  if (trace->stop_node() == this) {
+    // Back edge of greedy optimized loop node graph.
+    int text_length =
+        GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
+    DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
+    // Update the counter-based backtracking info on the stack. This is an
+    // optimization for greedy loops (see below).
+    DCHECK(trace->cp_offset() == text_length);
+    macro_assembler->AdvanceCurrentPosition(text_length);
+    macro_assembler->GoTo(trace->loop_label());
+    return;
+  }
+  DCHECK_NULL(trace->stop_node());
+  if (!trace->is_trivial()) {
+    trace->Flush(compiler, this);
+    return;
+  }
+  ChoiceNode::Emit(compiler, trace);
+}
+
+// Returns how many characters to preload: at most min(4, eats_at_least),
+// reduced from 3 to 2 for one-byte subjects, capped at 2 for other subjects,
+// and capped at 1 when the assembler cannot read unaligned.
+int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
+                                           int eats_at_least) {
+  int preload_characters = std::min(4, eats_at_least);
+  DCHECK_LE(preload_characters, 4);
+  if (compiler->macro_assembler()->CanReadUnaligned()) {
+    bool one_byte = compiler->one_byte();
+    if (one_byte) {
+      // We can't preload 3 characters because there is no machine instruction
+      // to do that. We can't just load 4 because we could be reading
+      // beyond the end of the string, which could cause a memory fault.
+      if (preload_characters == 3) preload_characters = 2;
+    } else {
+      if (preload_characters > 2) preload_characters = 2;
+    }
+  } else {
+    if (preload_characters > 1) preload_characters = 1;
+  }
+  return preload_characters;
+}
+
+// This class is used when generating the alternatives in a choice node. It
+// records the way the alternative is being code generated.
+class AlternativeGeneration : public Malloced {
+ public:
+  AlternativeGeneration()
+      : possible_success(),
+        expects_preload(false),
+        after(),
+        quick_check_details() {}
+  Label possible_success;
+  bool expects_preload;
+  Label after;
+  QuickCheckDetails quick_check_details;
+};
+
+// Creates a list of AlternativeGenerations. If the list has a reasonable
+// size then it is on the stack, otherwise the excess is on the heap.
+class AlternativeGenerationList { + public: + AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) { + for (int i = 0; i < count && i < kAFew; i++) { + alt_gens_.Add(a_few_alt_gens_ + i, zone); + } + for (int i = kAFew; i < count; i++) { + alt_gens_.Add(new AlternativeGeneration(), zone); + } + } + ~AlternativeGenerationList() { + for (int i = kAFew; i < alt_gens_.length(); i++) { + delete alt_gens_[i]; + alt_gens_[i] = nullptr; + } + } + + AlternativeGeneration* at(int i) { return alt_gens_[i]; } + + private: + static const int kAFew = 10; + ZoneList<AlternativeGeneration*> alt_gens_; + AlternativeGeneration a_few_alt_gens_[kAFew]; +}; + +void BoyerMoorePositionInfo::Set(int character) { + SetInterval(Interval(character, character)); +} + +namespace { + +ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges, + int ranges_length, Interval new_range) { + DCHECK_EQ(1, ranges_length & 1); + DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]); + if (containment == kLatticeUnknown) return containment; + bool inside = false; + int last = 0; + for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { + // Consider the range from last to ranges[i]. + // We haven't got to the new range yet. + if (ranges[i] <= new_range.from()) continue; + // New range is wholly inside last-ranges[i]. Note that new_range.to() is + // inclusive, but the values in ranges are not. + if (last <= new_range.from() && new_range.to() < ranges[i]) { + return Combine(containment, inside ? kLatticeIn : kLatticeOut); + } + return kLatticeUnknown; + } + return containment; +} + +int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) { + static_assert(BoyerMoorePositionInfo::kMapSize == + 2 * kInt64Size * kBitsPerByte); + + // Slight fiddling is needed here, since the bitset is of length 128 while + // CountTrailingZeros requires an integral type and std::bitset can only + // convert to unsigned long long. 
So we handle the most- and least-significant + // bits separately. + + { + static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0}); + BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask; + static_assert(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong()))); + uint64_t lsb = masked_bitset.to_ullong(); + if (lsb != 0) return base::bits::CountTrailingZeros(lsb); + } + + { + BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64; + uint64_t msb = masked_bitset.to_ullong(); + if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb); + } + + return -1; +} + +} // namespace + +void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { + w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); + + if (interval.size() >= kMapSize) { + map_count_ = kMapSize; + map_.set(); + return; + } + + for (int i = interval.from(); i <= interval.to(); i++) { + int mod_character = (i & kMask); + if (!map_[mod_character]) { + map_count_++; + map_.set(mod_character); + } + if (map_count_ == kMapSize) return; + } +} + +void BoyerMoorePositionInfo::SetAll() { + w_ = kLatticeUnknown; + if (map_count_ != kMapSize) { + map_count_ = kMapSize; + map_.set(); + } +} + +BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler, + Zone* zone) + : length_(length), + compiler_(compiler), + max_char_(MaxCodeUnit(compiler->one_byte())) { + bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone); + for (int i = 0; i < length; i++) { + bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone); + } +} + +// Find the longest range of lookahead that has the fewest number of different +// characters that can occur at a given position. Since we are optimizing two +// different parameters at once this is a tradeoff. 
+bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { + int biggest_points = 0; + // If more than 32 characters out of 128 can occur it is unlikely that we can + // be lucky enough to step forwards much of the time. + const int kMaxMax = 32; + for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax; + max_number_of_chars *= 2) { + biggest_points = + FindBestInterval(max_number_of_chars, biggest_points, from, to); + } + if (biggest_points == 0) return false; + return true; +} + +// Find the highest-points range between 0 and length_ where the character +// information is not too vague. 'Too vague' means that there are more than +// max_number_of_chars that can occur at this position. Calculates the number +// of points as the product of width-of-the-range and +// probability-of-finding-one-of-the-characters, where the probability is +// calculated using the frequency distribution of the sample subject string. +int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars, + int old_biggest_points, int* from, + int* to) { + int biggest_points = old_biggest_points; + static const int kSize = RegExpMacroAssembler::kTableSize; + for (int i = 0; i < length_;) { + while (i < length_ && Count(i) > max_number_of_chars) i++; + if (i == length_) break; + int remembered_from = i; + + BoyerMoorePositionInfo::Bitset union_bitset; + for (; i < length_ && Count(i) <= max_number_of_chars; i++) { + union_bitset |= bitmaps_->at(i)->raw_bitset(); + } + + int frequency = 0; + + // Iterate only over set bits. + int j; + while ((j = BitsetFirstSetBit(union_bitset)) != -1) { + DCHECK(union_bitset[j]); // Sanity check. + // Add 1 to the frequency to give a small per-character boost for + // the cases where our sampling is not good enough and many + // characters have a frequency of zero. This means the frequency + // can theoretically be up to 2*kSize though we treat it mostly as + // a fraction of kSize. 
+ frequency += compiler_->frequency_collator()->Frequency(j) + 1; + union_bitset.reset(j); + } + + // We use the probability of skipping times the distance we are skipping to + // judge the effectiveness of this. Actually we have a cut-off: By + // dividing by 2 we switch off the skipping if the probability of skipping + // is less than 50%. This is because the multibyte mask-and-compare + // skipping in quickcheck is more likely to do well on this case. + bool in_quickcheck_range = + ((i - remembered_from < 4) || + (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); + // Called 'probability' but it is only a rough estimate and can actually + // be outside the 0-kSize range. + int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; + int points = (i - remembered_from) * probability; + if (points > biggest_points) { + *from = remembered_from; + *to = i - 1; + biggest_points = points; + } + } + return biggest_points; +} + +// Take all the characters that will not prevent a successful match if they +// occur in the subject string in the range between min_lookahead and +// max_lookahead (inclusive) measured from the current position. If the +// character at max_lookahead offset is not one of these characters, then we +// can safely skip forwards by the number of characters in the range. +int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead, + Handle<ByteArray> boolean_skip_table) { + const int kSkipArrayEntry = 0; + const int kDontSkipArrayEntry = 1; + + std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry, + boolean_skip_table->length()); + + for (int i = max_lookahead; i >= min_lookahead; i--) { + BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset(); + + // Iterate only over set bits. + int j; + while ((j = BitsetFirstSetBit(bitset)) != -1) { + DCHECK(bitset[j]); // Sanity check. 
+ boolean_skip_table->set(j, kDontSkipArrayEntry); + bitset.reset(j); + } + } + + const int skip = max_lookahead + 1 - min_lookahead; + return skip; +} + +// See comment above on the implementation of GetSkipTable. +void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { + const int kSize = RegExpMacroAssembler::kTableSize; + + int min_lookahead = 0; + int max_lookahead = 0; + + if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; + + // Check if we only have a single non-empty position info, and that info + // contains precisely one character. + bool found_single_character = false; + int single_character = 0; + for (int i = max_lookahead; i >= min_lookahead; i--) { + BoyerMoorePositionInfo* map = bitmaps_->at(i); + if (map->map_count() == 0) continue; + + if (found_single_character || map->map_count() > 1) { + found_single_character = false; + break; + } + + DCHECK(!found_single_character); + DCHECK_EQ(map->map_count(), 1); + + found_single_character = true; + single_character = BitsetFirstSetBit(map->raw_bitset()); + + DCHECK_NE(single_character, -1); + } + + int lookahead_width = max_lookahead + 1 - min_lookahead; + + if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { + // The mask-compare can probably handle this better. 
+ return; + } + + if (found_single_character) { + Label cont, again; + masm->Bind(&again); + masm->LoadCurrentCharacter(max_lookahead, &cont, true); + if (max_char_ > kSize) { + masm->CheckCharacterAfterAnd(single_character, + RegExpMacroAssembler::kTableMask, &cont); + } else { + masm->CheckCharacter(single_character, &cont); + } + masm->AdvanceCurrentPosition(lookahead_width); + masm->GoTo(&again); + masm->Bind(&cont); + return; + } + + Factory* factory = masm->isolate()->factory(); + Handle<ByteArray> boolean_skip_table = + factory->NewByteArray(kSize, AllocationType::kOld); + int skip_distance = + GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table); + DCHECK_NE(0, skip_distance); + + Label cont, again; + masm->Bind(&again); + masm->LoadCurrentCharacter(max_lookahead, &cont, true); + masm->CheckBitInTable(boolean_skip_table, &cont); + masm->AdvanceCurrentPosition(skip_distance); + masm->GoTo(&again); + masm->Bind(&cont); +} + +/* Code generation for choice nodes. + * + * We generate quick checks that do a mask and compare to eliminate a + * choice. If the quick check succeeds then it jumps to the continuation to + * do slow checks and check subsequent nodes. If it fails (the common case) + * it falls through to the next choice. + * + * Here is the desired flow graph. Nodes directly below each other imply + * fallthrough. Alternatives 1 and 2 have quick checks. Alternative + * 3 doesn't have a quick check so we have to call the slow check. + * Nodes are marked Qn for quick checks and Sn for slow checks. The entire + * regexp continuation is generated directly after the Sn node, up to the + * next GoTo if we decide to reuse some already generated code. Some + * nodes expect preload_characters to be preloaded into the current + * character register. R nodes do this preloading. Vertices are marked + * F for failures and S for success (possible success in the case of quick + * nodes). L, V, < and > are used as arrow heads. 
+ * + * ----------> R + * | + * V + * Q1 -----> S1 + * | S / + * F| / + * | F/ + * | / + * | R + * | / + * V L + * Q2 -----> S2 + * | S / + * F| / + * | F/ + * | / + * | R + * | / + * V L + * S3 + * | + * F| + * | + * R + * | + * backtrack V + * <----------Q4 + * \ F | + * \ |S + * \ F V + * \-----S4 + * + * For greedy loops we push the current position, then generate the code that + * eats the input specially in EmitGreedyLoop. The other choice (the + * continuation) is generated by the normal code in EmitChoices, and steps back + * in the input to the starting position when it fails to match. The loop code + * looks like this (U is the unwind code that steps back in the greedy loop). + * + * _____ + * / \ + * V | + * ----------> S1 | + * /| | + * / |S | + * F/ \_____/ + * / + * |<----- + * | \ + * V |S + * Q2 ---> U----->backtrack + * | F / + * S| / + * V F / + * S2--/ + */ + +GreedyLoopState::GreedyLoopState(bool not_at_start) { + counter_backtrack_trace_.set_backtrack(&label_); + if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); +} + +void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { +#ifdef DEBUG + int choice_count = alternatives_->length(); + for (int i = 0; i < choice_count - 1; i++) { + GuardedAlternative alternative = alternatives_->at(i); + ZoneList<Guard*>* guards = alternative.guards(); + int guard_count = (guards == nullptr) ? 0 : guards->length(); + for (int j = 0; j < guard_count; j++) { + DCHECK(!trace->mentions_reg(guards->at(j)->reg())); + } + } +#endif +} + +void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, + PreloadState* state) { + if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { + // Save some time by looking at most one machine word ahead. 
+ state->eats_at_least_ = + EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE); + } + state->preload_characters_ = + CalculatePreloadCharacters(compiler, state->eats_at_least_); + + state->preload_is_current_ = + (current_trace->characters_preloaded() == state->preload_characters_); + state->preload_has_checked_bounds_ = state->preload_is_current_; +} + +void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { + int choice_count = alternatives_->length(); + + if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) { + alternatives_->at(0).node()->Emit(compiler, trace); + return; + } + + AssertGuardsMentionRegisters(trace); + + LimitResult limit_result = LimitVersions(compiler, trace); + if (limit_result == DONE) return; + DCHECK(limit_result == CONTINUE); + + // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for + // other choice nodes we only flush if we are out of code size budget. + if (trace->flush_budget() == 0 && trace->actions() != nullptr) { + trace->Flush(compiler, this); + return; + } + + RecursionCheck rc(compiler); + + PreloadState preload; + preload.init(); + GreedyLoopState greedy_loop_state(not_at_start()); + + int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); + AlternativeGenerationList alt_gens(choice_count, zone()); + + if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { + trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload, + &greedy_loop_state, text_length); + } else { + // TODO(erikcorry): Delete this. We don't need this label, but it makes us + // match the traces produced pre-cleanup. + Label second_choice; + compiler->macro_assembler()->Bind(&second_choice); + + preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); + + EmitChoices(compiler, &alt_gens, 0, trace, &preload); + } + + // At this point we need to generate slow checks for the alternatives where + // the quick check was inlined. 
We can recognize these because the associated + // label was bound. + int new_flush_budget = trace->flush_budget() / choice_count; + for (int i = 0; i < choice_count; i++) { + AlternativeGeneration* alt_gen = alt_gens.at(i); + Trace new_trace(*trace); + // If there are actions to be flushed we have to limit how many times + // they are flushed. Take the budget of the parent trace and distribute + // it fairly amongst the children. + if (new_trace.actions() != nullptr) { + new_trace.set_flush_budget(new_flush_budget); + } + bool next_expects_preload = + i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; + EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i), + alt_gen, preload.preload_characters_, + next_expects_preload); + } +} + +Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, + AlternativeGenerationList* alt_gens, + PreloadState* preload, + GreedyLoopState* greedy_loop_state, + int text_length) { + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + // Here we have special handling for greedy loops containing only text nodes + // and other simple nodes. These are handled by pushing the current + // position on the stack and then incrementing the current position each + // time around the switch. On backtrack we decrement the current position + // and check it against the pushed value. This avoids pushing backtrack + // information for each iteration of the loop, which could take up a lot of + // space. 
+ DCHECK(trace->stop_node() == nullptr); + macro_assembler->PushCurrentPosition(); + Label greedy_match_failed; + Trace greedy_match_trace; + if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); + greedy_match_trace.set_backtrack(&greedy_match_failed); + Label loop_label; + macro_assembler->Bind(&loop_label); + greedy_match_trace.set_stop_node(this); + greedy_match_trace.set_loop_label(&loop_label); + alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); + macro_assembler->Bind(&greedy_match_failed); + + Label second_choice; // For use in greedy matches. + macro_assembler->Bind(&second_choice); + + Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); + + EmitChoices(compiler, alt_gens, 1, new_trace, preload); + + macro_assembler->Bind(greedy_loop_state->label()); + // If we have unwound to the bottom then backtrack. + macro_assembler->CheckGreedyLoop(trace->backtrack()); + // Otherwise try the second priority at an earlier position. + macro_assembler->AdvanceCurrentPosition(-text_length); + macro_assembler->GoTo(&second_choice); + return new_trace; +} + +int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, + Trace* trace) { + int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; + if (alternatives_->length() != 2) return eats_at_least; + + GuardedAlternative alt1 = alternatives_->at(1); + if (alt1.guards() != nullptr && alt1.guards()->length() != 0) { + return eats_at_least; + } + RegExpNode* eats_anything_node = alt1.node(); + if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { + return eats_at_least; + } + + // Really we should be creating a new trace when we execute this function, + // but there is no need, because the code it generates cannot backtrack, and + // we always arrive here with a trivial trace (since it's the entry to a + // loop. 
That also implies that there are no preloaded characters, which is + // good, because it means we won't be violating any assumptions by + // overwriting those characters with new load instructions. + DCHECK(trace->is_trivial()); + + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + Isolate* isolate = macro_assembler->isolate(); + // At this point we know that we are at a non-greedy loop that will eat + // any character one at a time. Any non-anchored regexp has such a + // loop prepended to it in order to find where it starts. We look for + // a pattern of the form ...abc... where we can look 6 characters ahead + // and step forwards 3 if the character is not one of abc. Abc need + // not be atoms, they can be any reasonably limited character class or + // small alternation. + BoyerMooreLookahead* bm = bm_info(false); + if (bm == nullptr) { + eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false)); + if (eats_at_least >= 1) { + bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone()); + GuardedAlternative alt0 = alternatives_->at(0); + alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); + } + } + if (bm != nullptr) { + bm->EmitSkipInstructions(macro_assembler); + } + return eats_at_least; +} + +void ChoiceNode::EmitChoices(RegExpCompiler* compiler, + AlternativeGenerationList* alt_gens, + int first_choice, Trace* trace, + PreloadState* preload) { + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + SetUpPreLoad(compiler, trace, preload); + + // For now we just call all choices one after the other. The idea ultimately + // is to use the Dispatch table to try only the relevant ones. 
+ int choice_count = alternatives_->length(); + + int new_flush_budget = trace->flush_budget() / choice_count; + + for (int i = first_choice; i < choice_count; i++) { + bool is_last = i == choice_count - 1; + bool fall_through_on_failure = !is_last; + GuardedAlternative alternative = alternatives_->at(i); + AlternativeGeneration* alt_gen = alt_gens->at(i); + alt_gen->quick_check_details.set_characters(preload->preload_characters_); + ZoneList<Guard*>* guards = alternative.guards(); + int guard_count = (guards == nullptr) ? 0 : guards->length(); + Trace new_trace(*trace); + new_trace.set_characters_preloaded( + preload->preload_is_current_ ? preload->preload_characters_ : 0); + if (preload->preload_has_checked_bounds_) { + new_trace.set_bound_checked_up_to(preload->preload_characters_); + } + new_trace.quick_check_performed()->Clear(); + if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); + if (!is_last) { + new_trace.set_backtrack(&alt_gen->after); + } + alt_gen->expects_preload = preload->preload_is_current_; + bool generate_full_check_inline = false; + if (compiler->optimize() && + try_to_emit_quick_check_for_alternative(i == 0) && + alternative.node()->EmitQuickCheck( + compiler, trace, &new_trace, preload->preload_has_checked_bounds_, + &alt_gen->possible_success, &alt_gen->quick_check_details, + fall_through_on_failure, this)) { + // Quick check was generated for this choice. + preload->preload_is_current_ = true; + preload->preload_has_checked_bounds_ = true; + // If we generated the quick check to fall through on possible success, + // we now need to generate the full check inline. 
+ if (!fall_through_on_failure) { + macro_assembler->Bind(&alt_gen->possible_success); + new_trace.set_quick_check_performed(&alt_gen->quick_check_details); + new_trace.set_characters_preloaded(preload->preload_characters_); + new_trace.set_bound_checked_up_to(preload->preload_characters_); + generate_full_check_inline = true; + } + } else if (alt_gen->quick_check_details.cannot_match()) { + if (!fall_through_on_failure) { + macro_assembler->GoTo(trace->backtrack()); + } + continue; + } else { + // No quick check was generated. Put the full code here. + // If this is not the first choice then there could be slow checks from + // previous cases that go here when they fail. There's no reason to + // insist that they preload characters since the slow check we are about + // to generate probably can't use it. + if (i != first_choice) { + alt_gen->expects_preload = false; + new_trace.InvalidateCurrentCharacter(); + } + generate_full_check_inline = true; + } + if (generate_full_check_inline) { + if (new_trace.actions() != nullptr) { + new_trace.set_flush_budget(new_flush_budget); + } + for (int j = 0; j < guard_count; j++) { + GenerateGuard(macro_assembler, guards->at(j), &new_trace); + } + alternative.node()->Emit(compiler, &new_trace); + preload->preload_is_current_ = false; + } + macro_assembler->Bind(&alt_gen->after); + } +} + +void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, + Trace* trace, + GuardedAlternative alternative, + AlternativeGeneration* alt_gen, + int preload_characters, + bool next_expects_preload) { + if (!alt_gen->possible_success.is_linked()) return; + + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + macro_assembler->Bind(&alt_gen->possible_success); + Trace out_of_line_trace(*trace); + out_of_line_trace.set_characters_preloaded(preload_characters); + out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); + if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); + 
ZoneList<Guard*>* guards = alternative.guards(); + int guard_count = (guards == nullptr) ? 0 : guards->length(); + if (next_expects_preload) { + Label reload_current_char; + out_of_line_trace.set_backtrack(&reload_current_char); + for (int j = 0; j < guard_count; j++) { + GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); + } + alternative.node()->Emit(compiler, &out_of_line_trace); + macro_assembler->Bind(&reload_current_char); + // Reload the current character, since the next quick check expects that. + // We don't need to check bounds here because we only get into this + // code through a quick check which already did the checked load. + macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false, + preload_characters); + macro_assembler->GoTo(&(alt_gen->after)); + } else { + out_of_line_trace.set_backtrack(&(alt_gen->after)); + for (int j = 0; j < guard_count; j++) { + GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); + } + alternative.node()->Emit(compiler, &out_of_line_trace); + } +} + +void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + LimitResult limit_result = LimitVersions(compiler, trace); + if (limit_result == DONE) return; + DCHECK(limit_result == CONTINUE); + + RecursionCheck rc(compiler); + + switch (action_type_) { + case STORE_POSITION: { + Trace::DeferredCapture new_capture(data_.u_position_register.reg, + data_.u_position_register.is_capture, + trace); + Trace new_trace = *trace; + new_trace.add_action(&new_capture); + on_success()->Emit(compiler, &new_trace); + break; + } + case INCREMENT_REGISTER: { + Trace::DeferredIncrementRegister new_increment( + data_.u_increment_register.reg); + Trace new_trace = *trace; + new_trace.add_action(&new_increment); + on_success()->Emit(compiler, &new_trace); + break; + } + case SET_REGISTER_FOR_LOOP: { + Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg, + 
data_.u_store_register.value); + Trace new_trace = *trace; + new_trace.add_action(&new_set); + on_success()->Emit(compiler, &new_trace); + break; + } + case CLEAR_CAPTURES: { + Trace::DeferredClearCaptures new_capture(Interval( + data_.u_clear_captures.range_from, data_.u_clear_captures.range_to)); + Trace new_trace = *trace; + new_trace.add_action(&new_capture); + on_success()->Emit(compiler, &new_trace); + break; + } + case BEGIN_POSITIVE_SUBMATCH: + case BEGIN_NEGATIVE_SUBMATCH: + if (!trace->is_trivial()) { + trace->Flush(compiler, this); + } else { + assembler->WriteCurrentPositionToRegister( + data_.u_submatch.current_position_register, 0); + assembler->WriteStackPointerToRegister( + data_.u_submatch.stack_pointer_register); + on_success()->Emit(compiler, trace); + } + break; + case EMPTY_MATCH_CHECK: { + int start_pos_reg = data_.u_empty_match_check.start_register; + int stored_pos = 0; + int rep_reg = data_.u_empty_match_check.repetition_register; + bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); + bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); + if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { + // If we know we haven't advanced and there is no minimum we + // can just backtrack immediately. + assembler->GoTo(trace->backtrack()); + } else if (know_dist && stored_pos < trace->cp_offset()) { + // If we know we've advanced we can generate the continuation + // immediately. + on_success()->Emit(compiler, trace); + } else if (!trace->is_trivial()) { + trace->Flush(compiler, this); + } else { + Label skip_empty_check; + // If we have a minimum number of repetitions we check the current + // number first and skip the empty check if it's not enough. + if (has_minimum) { + int limit = data_.u_empty_match_check.repetition_limit; + assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); + } + // If the match is empty we bail out, otherwise we fall through + // to the on-success continuation. 
+ assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, + trace->backtrack()); + assembler->Bind(&skip_empty_check); + on_success()->Emit(compiler, trace); + } + break; + } + case POSITIVE_SUBMATCH_SUCCESS: { + if (!trace->is_trivial()) { + trace->Flush(compiler, this); + return; + } + assembler->ReadCurrentPositionFromRegister( + data_.u_submatch.current_position_register); + assembler->ReadStackPointerFromRegister( + data_.u_submatch.stack_pointer_register); + int clear_register_count = data_.u_submatch.clear_register_count; + if (clear_register_count == 0) { + on_success()->Emit(compiler, trace); + return; + } + int clear_registers_from = data_.u_submatch.clear_register_from; + Label clear_registers_backtrack; + Trace new_trace = *trace; + new_trace.set_backtrack(&clear_registers_backtrack); + on_success()->Emit(compiler, &new_trace); + + assembler->Bind(&clear_registers_backtrack); + int clear_registers_to = clear_registers_from + clear_register_count - 1; + assembler->ClearRegisters(clear_registers_from, clear_registers_to); + + DCHECK(trace->backtrack() == nullptr); + assembler->Backtrack(); + return; + } + default: + UNREACHABLE(); + } +} + +void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + if (!trace->is_trivial()) { + trace->Flush(compiler, this); + return; + } + + LimitResult limit_result = LimitVersions(compiler, trace); + if (limit_result == DONE) return; + DCHECK(limit_result == CONTINUE); + + RecursionCheck rc(compiler); + + DCHECK_EQ(start_reg_ + 1, end_reg_); + if (IsIgnoreCase(flags_)) { + bool unicode = IsEitherUnicode(flags_); + assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), + unicode, trace->backtrack()); + } else { + assembler->CheckNotBackReference(start_reg_, read_backward(), + trace->backtrack()); + } + // We are going to advance backward, so we may end up at the start. 
+ if (read_backward()) trace->set_at_start(Trace::UNKNOWN); + + // Check that the back reference does not end inside a surrogate pair. + if (IsEitherUnicode(flags_) && !compiler->one_byte()) { + assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); + } + on_success()->Emit(compiler, trace); +} + +void TextNode::CalculateOffsets() { + int element_count = elements()->length(); + // Set up the offsets of the elements relative to the start. This is a fixed + // quantity since a TextNode can only contain fixed-width things. + int cp_offset = 0; + for (int i = 0; i < element_count; i++) { + TextElement& elm = elements()->at(i); + elm.set_cp_offset(cp_offset); + cp_offset += elm.length(); + } +} + +namespace { + +// Assertion propagation moves information about assertions such as +// \b to the affected nodes. For instance, in /.\b./ information must +// be propagated to the first '.' that whatever follows needs to know +// if it matched a word or a non-word, and to the second '.' that it +// has to check if it succeeds a word or non-word. In this case the +// result will be something like: +// +// +-------+ +------------+ +// | . | | . | +// +-------+ ---> +------------+ +// | word? | | check word | +// +-------+ +------------+ +class AssertionPropagator : public AllStatic { + public: + static void VisitText(TextNode* that) {} + + static void VisitAction(ActionNode* that) { + // If the next node is interested in what it follows then this node + // has to be interested too so it can pass the information on. + that->info()->AddFromFollowing(that->on_success()->info()); + } + + static void VisitChoice(ChoiceNode* that, int i) { + // Anything the following nodes need to know has to be known by + // this node also, so it can pass it on. 
+ that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info()); + } + + static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { + that->info()->AddFromFollowing(that->continue_node()->info()); + } + + static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) { + that->info()->AddFromFollowing(that->loop_node()->info()); + } + + static void VisitNegativeLookaroundChoiceLookaroundNode( + NegativeLookaroundChoiceNode* that) { + VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex); + } + + static void VisitNegativeLookaroundChoiceContinueNode( + NegativeLookaroundChoiceNode* that) { + VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex); + } + + static void VisitBackReference(BackReferenceNode* that) {} + + static void VisitAssertion(AssertionNode* that) {} +}; + +// Propagates information about the minimum size of successful matches from +// successor nodes to their predecessors. Note that all eats_at_least values +// are initialized to zero before analysis. +class EatsAtLeastPropagator : public AllStatic { + public: + static void VisitText(TextNode* that) { + // The eats_at_least value is not used if reading backward. + if (!that->read_backward()) { + // We are not at the start after this node, and thus we can use the + // successor's eats_at_least_from_not_start value. + uint8_t eats_at_least = base::saturated_cast<uint8_t>( + that->Length() + that->on_success() + ->eats_at_least_info() + ->eats_at_least_from_not_start); + that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least)); + } + } + + static void VisitAction(ActionNode* that) { + switch (that->action_type()) { + case ActionNode::BEGIN_POSITIVE_SUBMATCH: + case ActionNode::POSITIVE_SUBMATCH_SUCCESS: + // We do not propagate eats_at_least data through positive lookarounds, + // because they rewind input. + // TODO(v8:11859) Potential approaches for fixing this include: + // 1. 
Add a dedicated choice node for positive lookaround, similar to + // NegativeLookaroundChoiceNode. + // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which + // is <= eats_at_least_from_possibly_start, and use that value in + // EatsAtLeastFromLoopEntry. + DCHECK(that->eats_at_least_info()->IsZero()); + break; + case ActionNode::SET_REGISTER_FOR_LOOP: + // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the + // loop body will run at least the minimum number of times before the + // continuation case can run. + that->set_eats_at_least_info( + that->on_success()->EatsAtLeastFromLoopEntry()); + break; + case ActionNode::BEGIN_NEGATIVE_SUBMATCH: + default: + // Otherwise, the current node eats at least as much as its successor. + // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH + // because NegativeLookaroundChoiceNode ignores its lookaround successor + // when computing eats-at-least and quick check information. + that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); + break; + } + } + + static void VisitChoice(ChoiceNode* that, int i) { + // The minimum possible match from a choice node is the minimum of its + // successors. + EatsAtLeastInfo eats_at_least = + i == 0 ? 
EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info(); + eats_at_least.SetMin( + *that->alternatives()->at(i).node()->eats_at_least_info()); + that->set_eats_at_least_info(eats_at_least); + } + + static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { + if (!that->read_backward()) { + that->set_eats_at_least_info( + *that->continue_node()->eats_at_least_info()); + } + } + + static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {} + + static void VisitNegativeLookaroundChoiceLookaroundNode( + NegativeLookaroundChoiceNode* that) {} + + static void VisitNegativeLookaroundChoiceContinueNode( + NegativeLookaroundChoiceNode* that) { + that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info()); + } + + static void VisitBackReference(BackReferenceNode* that) { + if (!that->read_backward()) { + that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); + } + } + + static void VisitAssertion(AssertionNode* that) { + EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info(); + if (that->assertion_type() == AssertionNode::AT_START) { + // If we know we are not at the start and we are asked "how many + // characters will you match if you succeed?" then we can answer anything + // since false implies false. So let's just set the max answer + // (UINT8_MAX) since that won't prevent us from preloading a lot of + // characters for the other branches in the node graph. + eats_at_least.eats_at_least_from_not_start = UINT8_MAX; + } + that->set_eats_at_least_info(eats_at_least); + } +}; + +} // namespace + +// ------------------------------------------------------------------- +// Analysis + +// Iterates the node graph and provides the opportunity for propagators to set +// values that depend on successor nodes. +template <typename... 
Propagators> +class Analysis : public NodeVisitor { + public: + Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags) + : isolate_(isolate), + is_one_byte_(is_one_byte), + flags_(flags), + error_(RegExpError::kNone) {} + + void EnsureAnalyzed(RegExpNode* that) { + StackLimitCheck check(isolate()); + if (check.HasOverflowed()) { + if (v8_flags.correctness_fuzzer_suppressions) { + FATAL("Analysis: Aborting on stack overflow"); + } + fail(RegExpError::kAnalysisStackOverflow); + return; + } + if (that->info()->been_analyzed || that->info()->being_analyzed) return; + that->info()->being_analyzed = true; + that->Accept(this); + that->info()->being_analyzed = false; + that->info()->been_analyzed = true; + } + + bool has_failed() { return error_ != RegExpError::kNone; } + RegExpError error() { + DCHECK(error_ != RegExpError::kNone); + return error_; + } + void fail(RegExpError error) { error_ = error; } + + Isolate* isolate() const { return isolate_; } + + void VisitEnd(EndNode* that) override { + // nothing to do + } + +// Used to call the given static function on each propagator / variadic template +// argument. 
+#define STATIC_FOR_EACH(expr) \ + do { \ + int dummy[] = {((expr), 0)...}; \ + USE(dummy); \ + } while (false) + + void VisitText(TextNode* that) override { + that->MakeCaseIndependent(isolate(), is_one_byte_, flags_); + EnsureAnalyzed(that->on_success()); + if (has_failed()) return; + that->CalculateOffsets(); + STATIC_FOR_EACH(Propagators::VisitText(that)); + } + + void VisitAction(ActionNode* that) override { + EnsureAnalyzed(that->on_success()); + if (has_failed()) return; + STATIC_FOR_EACH(Propagators::VisitAction(that)); + } + + void VisitChoice(ChoiceNode* that) override { + for (int i = 0; i < that->alternatives()->length(); i++) { + EnsureAnalyzed(that->alternatives()->at(i).node()); + if (has_failed()) return; + STATIC_FOR_EACH(Propagators::VisitChoice(that, i)); + } + } + + void VisitLoopChoice(LoopChoiceNode* that) override { + DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue. + + // First propagate all information from the continuation node. + EnsureAnalyzed(that->continue_node()); + if (has_failed()) return; + STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that)); + + // Check the loop last since it may need the value of this node + // to get a correct result. + EnsureAnalyzed(that->loop_node()); + if (has_failed()) return; + STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that)); + } + + void VisitNegativeLookaroundChoice( + NegativeLookaroundChoiceNode* that) override { + DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue. 
+ + EnsureAnalyzed(that->lookaround_node()); + if (has_failed()) return; + STATIC_FOR_EACH( + Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that)); + + EnsureAnalyzed(that->continue_node()); + if (has_failed()) return; + STATIC_FOR_EACH( + Propagators::VisitNegativeLookaroundChoiceContinueNode(that)); + } + + void VisitBackReference(BackReferenceNode* that) override { + EnsureAnalyzed(that->on_success()); + if (has_failed()) return; + STATIC_FOR_EACH(Propagators::VisitBackReference(that)); + } + + void VisitAssertion(AssertionNode* that) override { + EnsureAnalyzed(that->on_success()); + if (has_failed()) return; + STATIC_FOR_EACH(Propagators::VisitAssertion(that)); + } + +#undef STATIC_FOR_EACH + + private: + Isolate* isolate_; + const bool is_one_byte_; + const RegExpFlags flags_; + RegExpError error_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); +}; + +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, + RegExpNode* node) { + Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis( + isolate, is_one_byte, flags); + DCHECK_EQ(node->info()->been_analyzed, false); + analysis.EnsureAnalyzed(node); + DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); + return analysis.has_failed() ? analysis.error() : RegExpError::kNone; +} + +void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, + bool not_at_start) { + // Working out the set of characters that a backreference can match is too + // hard, so we just say that any character can match. 
+ bm->SetRest(offset); + SaveBMInfo(bm, not_at_start, offset); +} + +static_assert(BoyerMoorePositionInfo::kMapSize == + RegExpMacroAssembler::kTableSize); + +void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) { + ZoneList<GuardedAlternative>* alts = alternatives(); + budget = (budget - 1) / alts->length(); + for (int i = 0; i < alts->length(); i++) { + GuardedAlternative& alt = alts->at(i); + if (alt.guards() != nullptr && alt.guards()->length() != 0) { + bm->SetRest(offset); // Give up trying to fill in info. + SaveBMInfo(bm, not_at_start, offset); + return; + } + alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); + } + SaveBMInfo(bm, not_at_start, offset); +} + +void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) { + if (initial_offset >= bm->length()) return; + int offset = initial_offset; + int max_char = bm->max_char(); + for (int i = 0; i < elements()->length(); i++) { + if (offset >= bm->length()) { + if (initial_offset == 0) set_bm_info(not_at_start, bm); + return; + } + TextElement text = elements()->at(i); + if (text.text_type() == TextElement::ATOM) { + RegExpAtom* atom = text.atom(); + for (int j = 0; j < atom->length(); j++, offset++) { + if (offset >= bm->length()) { + if (initial_offset == 0) set_bm_info(not_at_start, bm); + return; + } + base::uc16 character = atom->data()[j]; + if (IsIgnoreCase(bm->compiler()->flags())) { + unibrow::uchar chars[4]; + int length = GetCaseIndependentLetters( + isolate, character, bm->max_char() == String::kMaxOneByteCharCode, + chars, 4); + for (int k = 0; k < length; k++) { + bm->Set(offset, chars[k]); + } + } else { + if (character <= max_char) bm->Set(offset, character); + } + } + } else { + DCHECK_EQ(TextElement::CLASS_RANGES, text.text_type()); + RegExpClassRanges* class_ranges = text.class_ranges(); + ZoneList<CharacterRange>* ranges = 
class_ranges->ranges(zone()); + if (class_ranges->is_negated()) { + bm->SetAll(offset); + } else { + for (int k = 0; k < ranges->length(); k++) { + CharacterRange& range = ranges->at(k); + if (static_cast<int>(range.from()) > max_char) continue; + int to = std::min(max_char, static_cast<int>(range.to())); + bm->SetInterval(offset, Interval(range.from(), to)); + } + } + offset++; + } + } + if (offset >= bm->length()) { + if (initial_offset == 0) set_bm_info(not_at_start, bm); + return; + } + on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, + true); // Not at start after a text node. + if (initial_offset == 0) set_bm_info(not_at_start, bm); +} + +RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( + RegExpNode* on_success) { + DCHECK(!read_backward()); + ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( + zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); + ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( + zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); + + ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone()); + + int stack_register = UnicodeLookaroundStackRegister(); + int position_register = UnicodeLookaroundPositionRegister(); + RegExpNode* step_back = TextNode::CreateForCharacterRanges( + zone(), lead_surrogates, true, on_success); + RegExpLookaround::Builder builder(true, step_back, stack_register, + position_register); + RegExpNode* match_trail = TextNode::CreateForCharacterRanges( + zone(), trail_surrogates, false, builder.on_match_success()); + + optional_step_back->AddAlternative( + GuardedAlternative(builder.ForMatch(match_trail))); + optional_step_back->AddAlternative(GuardedAlternative(on_success)); + + return optional_step_back; +} + +RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, + RegExpFlags flags, + bool is_one_byte) { + // Wrap the body of the regexp in capture #0. 
+ RegExpNode* captured_body = + RegExpCapture::ToNode(data->tree, 0, this, accept()); + RegExpNode* node = captured_body; + if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) { + // Add a .*? at the beginning, outside the body capture, unless + // this expression is anchored at the beginning or sticky. + RegExpNode* loop_node = RegExpQuantifier::ToNode( + 0, RegExpTree::kInfinity, false, + zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything), this, + captured_body, data->contains_anchor); + + if (data->contains_anchor) { + // Unroll loop once, to take care of the case that might start + // at the start of input. + ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone()); + first_step_node->AddAlternative(GuardedAlternative(captured_body)); + first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>( + zone()->New<RegExpClassRanges>(StandardCharacterSet::kEverything), + false, loop_node))); + node = first_step_node; + } else { + node = loop_node; + } + } + if (is_one_byte) { + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); + // Do it again to propagate the new nodes to places where they were not + // put because they had not been calculated yet. 
+ if (node != nullptr) { + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); + } + } else if (IsEitherUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) { + node = OptionallyStepBackToLeadSurrogate(node); + } + + if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone()); + return node; +} + +void RegExpCompiler::ToNodeCheckForStackOverflow() { + if (StackLimitCheck{isolate()}.HasOverflowed()) { + V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler"); + } +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-compiler.h b/js/src/irregexp/imported/regexp-compiler.h new file mode 100644 index 0000000000..91dd43ab8a --- /dev/null +++ b/js/src/irregexp/imported/regexp-compiler.h @@ -0,0 +1,621 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_COMPILER_H_ +#define V8_REGEXP_REGEXP_COMPILER_H_ + +#include <bitset> + +#include "irregexp/imported/regexp-nodes.h" + +namespace v8 { +namespace internal { + +class DynamicBitSet; +class Isolate; + +namespace regexp_compiler_constants { + +// The '2' variant has inclusive from and exclusive to. +// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, +// which includes WhiteSpace (7.2) or LineTerminator (7.3) values. 
+constexpr base::uc32 kRangeEndMarker = 0x110000; +constexpr int kSpaceRanges[] = { + '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, + 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, + 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; +constexpr int kSpaceRangeCount = arraysize(kSpaceRanges); + +constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', + '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; +constexpr int kWordRangeCount = arraysize(kWordRanges); +constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; +constexpr int kDigitRangeCount = arraysize(kDigitRanges); +constexpr int kSurrogateRanges[] = {kLeadSurrogateStart, + kLeadSurrogateStart + 1, kRangeEndMarker}; +constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); +constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, + 0x2028, 0x202A, kRangeEndMarker}; +constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); + +// More makes code generation slower, less makes V8 benchmark score lower. +constexpr int kMaxLookaheadForBoyerMoore = 8; +// In a 3-character pattern you can maximally step forwards 3 characters +// at a time, which is not always enough to pay for the extra logic. +constexpr int kPatternTooShortForBoyerMoore = 2; + +} // namespace regexp_compiler_constants + +inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) { + // Both unicode (or unicode sets) and ignore_case flags are set. We need to + // use ICU to find the closure over case equivalents. + return IsEitherUnicode(flags) && IsIgnoreCase(flags); +} + +// Details of a quick mask-compare check that can look ahead in the +// input stream. 
+class QuickCheckDetails { + public: + QuickCheckDetails() + : characters_(0), mask_(0), value_(0), cannot_match_(false) {} + explicit QuickCheckDetails(int characters) + : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} + bool Rationalize(bool one_byte); + // Merge in the information from another branch of an alternation. + void Merge(QuickCheckDetails* other, int from_index); + // Advance the current position by some amount. + void Advance(int by, bool one_byte); + void Clear(); + bool cannot_match() { return cannot_match_; } + void set_cannot_match() { cannot_match_ = true; } + struct Position { + Position() : mask(0), value(0), determines_perfectly(false) {} + base::uc32 mask; + base::uc32 value; + bool determines_perfectly; + }; + int characters() { return characters_; } + void set_characters(int characters) { characters_ = characters; } + Position* positions(int index) { + DCHECK_LE(0, index); + DCHECK_GT(characters_, index); + return positions_ + index; + } + uint32_t mask() { return mask_; } + uint32_t value() { return value_; } + + private: + // How many characters do we have quick check information from. This is + // the same for all branches of a choice node. + int characters_; + Position positions_[4]; + // These values are the condensate of the above array after Rationalize(). + uint32_t mask_; + uint32_t value_; + // If set to true, there is no way this quick check can match at all. + // E.g., if it requires to be at the start of the input, and isn't. + bool cannot_match_; +}; + +// Improve the speed that we scan for an initial point where a non-anchored +// regexp can match by using a Boyer-Moore-like table. This is done by +// identifying non-greedy non-capturing loops in the nodes that eat any +// character one at a time. For example in the middle of the regexp +// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly +// inserted at the start of any non-anchored regexp. 
+// +// When we have found such a loop we look ahead in the nodes to find the set of +// characters that can come at given distances. For example for the regexp +// /.?foo/ we know that there are at least 3 characters ahead of us, and the +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in +// the lookahead info where the set of characters is reasonably constrained. In +// our example this is from index 1 to 2 (0 is not constrained). We can now +// look 3 characters ahead and if we don't find one of [f, o] (the union of +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). +// +// For Unicode input strings we do the same, but modulo 128. +// +// We also look at the first string fed to the regexp and use that to get a hint +// of the character frequencies in the inputs. This affects the assessment of +// whether the set of characters is 'reasonably constrained'. +// +// We also have another lookahead mechanism (called quick check in the code), +// which uses a wide load of multiple characters followed by a mask and compare +// to determine whether a match is possible at this point. +enum ContainedInLattice { + kNotYet = 0, + kLatticeIn = 1, + kLatticeOut = 2, + kLatticeUnknown = 3 // Can also mean both in and out. 
+}; + +inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { + return static_cast<ContainedInLattice>(a | b); +} + +class BoyerMoorePositionInfo : public ZoneObject { + public: + bool at(int i) const { return map_[i]; } + + static constexpr int kMapSize = 128; + static constexpr int kMask = kMapSize - 1; + + int map_count() const { return map_count_; } + + void Set(int character); + void SetInterval(const Interval& interval); + void SetAll(); + + bool is_non_word() { return w_ == kLatticeOut; } + bool is_word() { return w_ == kLatticeIn; } + + using Bitset = std::bitset<kMapSize>; + Bitset raw_bitset() const { return map_; } + + private: + Bitset map_; + int map_count_ = 0; // Number of set bits in the map. + ContainedInLattice w_ = kNotYet; // The \w character class. +}; + +class BoyerMooreLookahead : public ZoneObject { + public: + BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); + + int length() { return length_; } + int max_char() { return max_char_; } + RegExpCompiler* compiler() { return compiler_; } + + int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } + + BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } + + void Set(int map_number, int character) { + if (character > max_char_) return; + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); + info->Set(character); + } + + void SetInterval(int map_number, const Interval& interval) { + if (interval.from() > max_char_) return; + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); + if (interval.to() > max_char_) { + info->SetInterval(Interval(interval.from(), max_char_)); + } else { + info->SetInterval(interval); + } + } + + void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } + + void SetRest(int from_map) { + for (int i = from_map; i < length_; i++) SetAll(i); + } + void EmitSkipInstructions(RegExpMacroAssembler* masm); + + private: + // This is the value obtained by EatsAtLeast. 
If we do not have at least this + // many characters left in the sample string then the match is bound to fail. + // Therefore it is OK to read a character this far ahead of the current match + // point. + int length_; + RegExpCompiler* compiler_; + // 0xff for Latin1, 0xffff for UTF-16. + int max_char_; + ZoneList<BoyerMoorePositionInfo*>* bitmaps_; + + int GetSkipTable(int min_lookahead, int max_lookahead, + Handle<ByteArray> boolean_skip_table); + bool FindWorthwhileInterval(int* from, int* to); + int FindBestInterval(int max_number_of_chars, int old_biggest_points, + int* from, int* to); +}; + +// There are many ways to generate code for a node. This class encapsulates +// the current way we should be generating. In other words it encapsulates +// the current state of the code generator. The effect of this is that we +// generate code for paths that the matcher can take through the regular +// expression. A given node in the regexp can be code-generated several times +// as it can be part of several traces. For example for the regexp: +// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part +// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code +// to match foo is generated only once (the traces have a common prefix). The +// code to store the capture is deferred and generated (twice) after the places +// where baz has been matched. +class Trace { + public: + // A value for a property that is either known to be true, known to be false, + // or not known. 
+ enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; + + class DeferredAction { + public: + DeferredAction(ActionNode::ActionType action_type, int reg) + : action_type_(action_type), reg_(reg), next_(nullptr) {} + DeferredAction* next() { return next_; } + bool Mentions(int reg); + int reg() { return reg_; } + ActionNode::ActionType action_type() { return action_type_; } + + private: + ActionNode::ActionType action_type_; + int reg_; + DeferredAction* next_; + friend class Trace; + }; + + class DeferredCapture : public DeferredAction { + public: + DeferredCapture(int reg, bool is_capture, Trace* trace) + : DeferredAction(ActionNode::STORE_POSITION, reg), + cp_offset_(trace->cp_offset()), + is_capture_(is_capture) {} + int cp_offset() { return cp_offset_; } + bool is_capture() { return is_capture_; } + + private: + int cp_offset_; + bool is_capture_; + void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } + }; + + class DeferredSetRegisterForLoop : public DeferredAction { + public: + DeferredSetRegisterForLoop(int reg, int value) + : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg), + value_(value) {} + int value() { return value_; } + + private: + int value_; + }; + + class DeferredClearCaptures : public DeferredAction { + public: + explicit DeferredClearCaptures(Interval range) + : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {} + Interval range() { return range_; } + + private: + Interval range_; + }; + + class DeferredIncrementRegister : public DeferredAction { + public: + explicit DeferredIncrementRegister(int reg) + : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {} + }; + + Trace() + : cp_offset_(0), + actions_(nullptr), + backtrack_(nullptr), + stop_node_(nullptr), + loop_label_(nullptr), + characters_preloaded_(0), + bound_checked_up_to_(0), + flush_budget_(100), + at_start_(UNKNOWN) {} + + // End the trace. 
This involves flushing the deferred actions in the trace + // and pushing a backtrack location onto the backtrack stack. Once this is + // done we can start a new trace or go to one that has already been + // generated. + void Flush(RegExpCompiler* compiler, RegExpNode* successor); + int cp_offset() { return cp_offset_; } + DeferredAction* actions() { return actions_; } + // A trivial trace is one that has no deferred actions or other state that + // affects the assumptions used when generating code. There is no recorded + // backtrack location in a trivial trace, so with a trivial trace we will + // generate code that, on a failure to match, gets the backtrack location + // from the backtrack stack rather than using a direct jump instruction. We + // always start code generation with a trivial trace and non-trivial traces + // are created as we emit code for nodes or add to the list of deferred + // actions in the trace. The location of the code generated for a node using + // a trivial trace is recorded in a label in the node so that gotos can be + // generated to that code. + bool is_trivial() { + return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && + characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && + quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; + } + TriBool at_start() { return at_start_; } + void set_at_start(TriBool at_start) { at_start_ = at_start; } + Label* backtrack() { return backtrack_; } + Label* loop_label() { return loop_label_; } + RegExpNode* stop_node() { return stop_node_; } + int characters_preloaded() { return characters_preloaded_; } + int bound_checked_up_to() { return bound_checked_up_to_; } + int flush_budget() { return flush_budget_; } + QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } + bool mentions_reg(int reg); + // Returns true if a deferred position store exists to the specified + // register and stores the offset in the out-parameter. 
Otherwise + // returns false. + bool GetStoredPosition(int reg, int* cp_offset); + // These set methods and AdvanceCurrentPositionInTrace should be used only on + // new traces - the intention is that traces are immutable after creation. + void add_action(DeferredAction* new_action) { + DCHECK(new_action->next_ == nullptr); + new_action->next_ = actions_; + actions_ = new_action; + } + void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } + void set_stop_node(RegExpNode* node) { stop_node_ = node; } + void set_loop_label(Label* label) { loop_label_ = label; } + void set_characters_preloaded(int count) { characters_preloaded_ = count; } + void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } + void set_flush_budget(int to) { flush_budget_ = to; } + void set_quick_check_performed(QuickCheckDetails* d) { + quick_check_performed_ = *d; + } + void InvalidateCurrentCharacter(); + void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); + + private: + int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); + void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, + const DynamicBitSet& affected_registers, + DynamicBitSet* registers_to_pop, + DynamicBitSet* registers_to_clear, Zone* zone); + void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, + const DynamicBitSet& registers_to_pop, + const DynamicBitSet& registers_to_clear); + int cp_offset_; + DeferredAction* actions_; + Label* backtrack_; + RegExpNode* stop_node_; + Label* loop_label_; + int characters_preloaded_; + int bound_checked_up_to_; + QuickCheckDetails quick_check_performed_; + int flush_budget_; + TriBool at_start_; +}; + +class GreedyLoopState { + public: + explicit GreedyLoopState(bool not_at_start); + + Label* label() { return &label_; } + Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } + + private: + Label label_; + Trace counter_backtrack_trace_; +}; + +struct PreloadState { + static 
const int kEatsAtLeastNotYetInitialized = -1; + bool preload_is_current_; + bool preload_has_checked_bounds_; + int preload_characters_; + int eats_at_least_; + void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } +}; + +// Analysis performs assertion propagation and computes eats_at_least_ values. +// See the comments on AssertionPropagator and EatsAtLeastPropagator for more +// details. +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, + RegExpNode* node); + +class FrequencyCollator { + public: + FrequencyCollator() : total_samples_(0) { + for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { + frequencies_[i] = CharacterFrequency(i); + } + } + + void CountCharacter(int character) { + int index = (character & RegExpMacroAssembler::kTableMask); + frequencies_[index].Increment(); + total_samples_++; + } + + // Does not measure in percent, but rather per-128 (the table size from the + // regexp macro assembler). + int Frequency(int in_character) { + DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); + if (total_samples_ < 1) return 1; // Division by zero. 
+ int freq_in_per128 = + (frequencies_[in_character].counter() * 128) / total_samples_; + return freq_in_per128; + } + + private: + class CharacterFrequency { + public: + CharacterFrequency() : counter_(0), character_(-1) {} + explicit CharacterFrequency(int character) + : counter_(0), character_(character) {} + + void Increment() { counter_++; } + int counter() { return counter_; } + int character() { return character_; } + + private: + int counter_; + int character_; + }; + + private: + CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; + int total_samples_; +}; + +class RegExpCompiler { + public: + RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, + RegExpFlags flags, bool is_one_byte); + + int AllocateRegister() { + if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { + reg_exp_too_big_ = true; + return next_register_; + } + return next_register_++; + } + + // Lookarounds to match lone surrogates for unicode character class matches + // are never nested. We can therefore reuse registers. 
+ int UnicodeLookaroundStackRegister() { + if (unicode_lookaround_stack_register_ == kNoRegister) { + unicode_lookaround_stack_register_ = AllocateRegister(); + } + return unicode_lookaround_stack_register_; + } + + int UnicodeLookaroundPositionRegister() { + if (unicode_lookaround_position_register_ == kNoRegister) { + unicode_lookaround_position_register_ = AllocateRegister(); + } + return unicode_lookaround_position_register_; + } + + struct CompilationResult final { + explicit CompilationResult(RegExpError err) : error(err) {} + CompilationResult(Handle<Object> code, int registers) + : code(code), num_registers(registers) {} + + static CompilationResult RegExpTooBig() { + return CompilationResult(RegExpError::kTooLarge); + } + + bool Succeeded() const { return error == RegExpError::kNone; } + + const RegExpError error = RegExpError::kNone; + Handle<Object> code; + int num_registers = 0; + }; + + CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, + RegExpNode* start, int capture_count, + Handle<String> pattern); + + // Preprocessing is the final step of node creation before analysis + // and assembly. It includes: + // - Wrapping the body of the regexp in capture 0. + // - Inserting the implicit .* before/after the regexp if necessary. + // - If the input is a one-byte string, filtering out nodes that can't match. + // - Fixing up regexp matches that start within a surrogate pair. + RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags, + bool is_one_byte); + + // If the regexp matching starts within a surrogate pair, step back to the + // lead surrogate and start matching from there. 
+ RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success); + + inline void AddWork(RegExpNode* node) { + if (!node->on_work_list() && !node->label()->is_bound()) { + node->set_on_work_list(true); + work_list_->push_back(node); + } + } + + static const int kImplementationOffset = 0; + static const int kNumberOfRegistersOffset = 0; + static const int kCodeOffset = 1; + + RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } + EndNode* accept() { return accept_; } + + static const int kMaxRecursion = 100; + inline int recursion_depth() { return recursion_depth_; } + inline void IncrementRecursionDepth() { recursion_depth_++; } + inline void DecrementRecursionDepth() { recursion_depth_--; } + + RegExpFlags flags() const { return flags_; } + + void SetRegExpTooBig() { reg_exp_too_big_ = true; } + + inline bool one_byte() { return one_byte_; } + inline bool optimize() { return optimize_; } + inline void set_optimize(bool value) { optimize_ = value; } + inline bool limiting_recursion() { return limiting_recursion_; } + inline void set_limiting_recursion(bool value) { + limiting_recursion_ = value; + } + bool read_backward() { return read_backward_; } + void set_read_backward(bool value) { read_backward_ = value; } + FrequencyCollator* frequency_collator() { return &frequency_collator_; } + + int current_expansion_factor() { return current_expansion_factor_; } + void set_current_expansion_factor(int value) { + current_expansion_factor_ = value; + } + + // The recursive nature of ToNode node generation means we may run into stack + // overflow issues. We introduce periodic checks to detect these, and the + // tick counter helps limit overhead of these checks. + // TODO(jgruber): This is super hacky and should be replaced by an abort + // mechanism or iterative node generation. 
+ void ToNodeMaybeCheckForStackOverflow() { + if ((to_node_overflow_check_ticks_++ % 16 == 0)) { + ToNodeCheckForStackOverflow(); + } + } + void ToNodeCheckForStackOverflow(); + + Isolate* isolate() const { return isolate_; } + Zone* zone() const { return zone_; } + + static const int kNoRegister = -1; + + private: + EndNode* accept_; + int next_register_; + int unicode_lookaround_stack_register_; + int unicode_lookaround_position_register_; + ZoneVector<RegExpNode*>* work_list_; + int recursion_depth_; + const RegExpFlags flags_; + RegExpMacroAssembler* macro_assembler_; + bool one_byte_; + bool reg_exp_too_big_; + bool limiting_recursion_; + int to_node_overflow_check_ticks_ = 0; + bool optimize_; + bool read_backward_; + int current_expansion_factor_; + FrequencyCollator frequency_collator_; + Isolate* isolate_; + Zone* zone_; +}; + +// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. +class UnicodeRangeSplitter { + public: + V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base); + + static constexpr int kInitialSize = 8; + using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>; + + const CharacterRangeVector* bmp() const { return &bmp_; } + const CharacterRangeVector* lead_surrogates() const { + return &lead_surrogates_; + } + const CharacterRangeVector* trail_surrogates() const { + return &trail_surrogates_; + } + const CharacterRangeVector* non_bmp() const { return &non_bmp_; } + + private: + void AddRange(CharacterRange range); + + CharacterRangeVector bmp_; + CharacterRangeVector lead_surrogates_; + CharacterRangeVector trail_surrogates_; + CharacterRangeVector non_bmp_; +}; + +// We need to check for the following characters: 0x39C 0x3BC 0x178. +// TODO(jgruber): Move to CharacterRange. 
+bool RangeContainsLatin1Equivalents(CharacterRange range); + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_COMPILER_H_ diff --git a/js/src/irregexp/imported/regexp-dotprinter.cc b/js/src/irregexp/imported/regexp-dotprinter.cc new file mode 100644 index 0000000000..6746992a0a --- /dev/null +++ b/js/src/irregexp/imported/regexp-dotprinter.cc @@ -0,0 +1,249 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/imported/regexp-dotprinter.h" + +#include "irregexp/imported/regexp-compiler.h" + +namespace v8 { +namespace internal { + +// ------------------------------------------------------------------- +// Dot/dotty output + +class DotPrinterImpl : public NodeVisitor { + public: + explicit DotPrinterImpl(std::ostream& os) : os_(os) {} + void PrintNode(const char* label, RegExpNode* node); + void Visit(RegExpNode* node); + void PrintAttributes(RegExpNode* from); + void PrintOnFailure(RegExpNode* from, RegExpNode* to); +#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that); + FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT + private: + std::ostream& os_; +}; + +void DotPrinterImpl::PrintNode(const char* label, RegExpNode* node) { + os_ << "digraph G {\n graph [label=\""; + for (int i = 0; label[i]; i++) { + switch (label[i]) { + case '\\': + os_ << "\\\\"; + break; + case '"': + os_ << "\""; + break; + default: + os_ << label[i]; + break; + } + } + os_ << "\"];\n"; + Visit(node); + os_ << "}" << std::endl; +} + +void DotPrinterImpl::Visit(RegExpNode* node) { + if (node->info()->visited) return; + node->info()->visited = true; + node->Accept(this); +} + +void DotPrinterImpl::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { + os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; + Visit(on_failure); +} + +class AttributePrinter { + public: + explicit 
AttributePrinter(std::ostream& os) : os_(os), first_(true) {} + void PrintSeparator() { + if (first_) { + first_ = false; + } else { + os_ << "|"; + } + } + void PrintBit(const char* name, bool value) { + if (!value) return; + PrintSeparator(); + os_ << "{" << name << "}"; + } + void PrintPositive(const char* name, int value) { + if (value < 0) return; + PrintSeparator(); + os_ << "{" << name << "|" << value << "}"; + } + + private: + std::ostream& os_; + bool first_; +}; + +void DotPrinterImpl::PrintAttributes(RegExpNode* that) { + os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " + << "margin=0.1, fontsize=10, label=\"{"; + AttributePrinter printer(os_); + NodeInfo* info = that->info(); + printer.PrintBit("NI", info->follows_newline_interest); + printer.PrintBit("WI", info->follows_word_interest); + printer.PrintBit("SI", info->follows_start_interest); + Label* label = that->label(); + if (label->is_bound()) printer.PrintPositive("@", label->pos()); + os_ << "}\"];\n" + << " a" << that << " -> n" << that + << " [style=dashed, color=grey, arrowhead=none];\n"; +} + +void DotPrinterImpl::VisitChoice(ChoiceNode* that) { + os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; + for (int i = 0; i < that->alternatives()->length(); i++) { + GuardedAlternative alt = that->alternatives()->at(i); + os_ << " n" << that << " -> n" << alt.node(); + } + for (int i = 0; i < that->alternatives()->length(); i++) { + GuardedAlternative alt = that->alternatives()->at(i); + alt.node()->Accept(this); + } +} + +void DotPrinterImpl::VisitLoopChoice(LoopChoiceNode* that) { + VisitChoice(that); +} + +void DotPrinterImpl::VisitNegativeLookaroundChoice( + NegativeLookaroundChoiceNode* that) { + VisitChoice(that); +} + +void DotPrinterImpl::VisitText(TextNode* that) { + Zone* zone = that->zone(); + os_ << " n" << that << " [label=\""; + for (int i = 0; i < that->elements()->length(); i++) { + if (i > 0) os_ << " "; + TextElement elm = that->elements()->at(i); + 
switch (elm.text_type()) { + case TextElement::ATOM: { + base::Vector<const base::uc16> data = elm.atom()->data(); + for (int j = 0; j < data.length(); j++) { + os_ << static_cast<char>(data[j]); + } + break; + } + case TextElement::CLASS_RANGES: { + RegExpClassRanges* node = elm.class_ranges(); + os_ << "["; + if (node->is_negated()) os_ << "^"; + for (int j = 0; j < node->ranges(zone)->length(); j++) { + CharacterRange range = node->ranges(zone)->at(j); + os_ << AsUC32(range.from()) << "-" << AsUC32(range.to()); + } + os_ << "]"; + break; + } + default: + UNREACHABLE(); + } + } + os_ << "\", shape=box, peripheries=2];\n"; + PrintAttributes(that); + os_ << " n" << that << " -> n" << that->on_success() << ";\n"; + Visit(that->on_success()); +} + +void DotPrinterImpl::VisitBackReference(BackReferenceNode* that) { + os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" + << that->end_register() << "\", shape=doubleoctagon];\n"; + PrintAttributes(that); + os_ << " n" << that << " -> n" << that->on_success() << ";\n"; + Visit(that->on_success()); +} + +void DotPrinterImpl::VisitEnd(EndNode* that) { + os_ << " n" << that << " [style=bold, shape=point];\n"; + PrintAttributes(that); +} + +void DotPrinterImpl::VisitAssertion(AssertionNode* that) { + os_ << " n" << that << " ["; + switch (that->assertion_type()) { + case AssertionNode::AT_END: + os_ << "label=\"$\", shape=septagon"; + break; + case AssertionNode::AT_START: + os_ << "label=\"^\", shape=septagon"; + break; + case AssertionNode::AT_BOUNDARY: + os_ << "label=\"\\b\", shape=septagon"; + break; + case AssertionNode::AT_NON_BOUNDARY: + os_ << "label=\"\\B\", shape=septagon"; + break; + case AssertionNode::AFTER_NEWLINE: + os_ << "label=\"(?<=\\n)\", shape=septagon"; + break; + } + os_ << "];\n"; + PrintAttributes(that); + RegExpNode* successor = that->on_success(); + os_ << " n" << that << " -> n" << successor << ";\n"; + Visit(successor); +} + +void DotPrinterImpl::VisitAction(ActionNode* that) 
{ + os_ << " n" << that << " ["; + switch (that->action_type_) { + case ActionNode::SET_REGISTER_FOR_LOOP: + os_ << "label=\"$" << that->data_.u_store_register.reg + << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; + break; + case ActionNode::INCREMENT_REGISTER: + os_ << "label=\"$" << that->data_.u_increment_register.reg + << "++\", shape=octagon"; + break; + case ActionNode::STORE_POSITION: + os_ << "label=\"$" << that->data_.u_position_register.reg + << ":=$pos\", shape=octagon"; + break; + case ActionNode::BEGIN_POSITIVE_SUBMATCH: + os_ << "label=\"$" << that->data_.u_submatch.current_position_register + << ":=$pos,begin-positive\", shape=septagon"; + break; + case ActionNode::BEGIN_NEGATIVE_SUBMATCH: + os_ << "label=\"$" << that->data_.u_submatch.current_position_register + << ":=$pos,begin-negative\", shape=septagon"; + break; + case ActionNode::POSITIVE_SUBMATCH_SUCCESS: + os_ << "label=\"escape\", shape=septagon"; + break; + case ActionNode::EMPTY_MATCH_CHECK: + os_ << "label=\"$" << that->data_.u_empty_match_check.start_register + << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register + << "<" << that->data_.u_empty_match_check.repetition_limit + << "?\", shape=septagon"; + break; + case ActionNode::CLEAR_CAPTURES: { + os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from + << " to $" << that->data_.u_clear_captures.range_to + << "\", shape=septagon"; + break; + } + } + os_ << "];\n"; + PrintAttributes(that); + RegExpNode* successor = that->on_success(); + os_ << " n" << that << " -> n" << successor << ";\n"; + Visit(successor); +} + +void DotPrinter::DotPrint(const char* label, RegExpNode* node) { + StdoutStream os; + DotPrinterImpl printer(os); + printer.PrintNode(label, node); +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-dotprinter.h b/js/src/irregexp/imported/regexp-dotprinter.h new file mode 100644 index 0000000000..7fcece6e1a --- /dev/null +++ 
b/js/src/irregexp/imported/regexp-dotprinter.h @@ -0,0 +1,23 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_DOTPRINTER_H_ +#define V8_REGEXP_REGEXP_DOTPRINTER_H_ + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class RegExpNode; + +class DotPrinter final : public AllStatic { + public: + static void DotPrint(const char* label, RegExpNode* node); +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_DOTPRINTER_H_ diff --git a/js/src/irregexp/imported/regexp-error.cc b/js/src/irregexp/imported/regexp-error.cc new file mode 100644 index 0000000000..d0b4c263a4 --- /dev/null +++ b/js/src/irregexp/imported/regexp-error.cc @@ -0,0 +1,22 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/imported/regexp-error.h" + +namespace v8 { +namespace internal { + +const char* const kRegExpErrorStrings[] = { +#define TEMPLATE(NAME, STRING) STRING, + REGEXP_ERROR_MESSAGES(TEMPLATE) +#undef TEMPLATE +}; + +const char* RegExpErrorString(RegExpError error) { + DCHECK_LT(error, RegExpError::NumErrors); + return kRegExpErrorStrings[static_cast<int>(error)]; +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-error.h b/js/src/irregexp/imported/regexp-error.h new file mode 100644 index 0000000000..ff4fe41cd5 --- /dev/null +++ b/js/src/irregexp/imported/regexp-error.h @@ -0,0 +1,67 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef V8_REGEXP_REGEXP_ERROR_H_ +#define V8_REGEXP_REGEXP_ERROR_H_ + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +#define REGEXP_ERROR_MESSAGES(T) \ + T(None, "") \ + T(StackOverflow, "Maximum call stack size exceeded") \ + T(AnalysisStackOverflow, "Stack overflow") \ + T(TooLarge, "Regular expression too large") \ + T(UnterminatedGroup, "Unterminated group") \ + T(UnmatchedParen, "Unmatched ')'") \ + T(EscapeAtEndOfPattern, "\\ at end of pattern") \ + T(InvalidPropertyName, "Invalid property name") \ + T(InvalidEscape, "Invalid escape") \ + T(InvalidDecimalEscape, "Invalid decimal escape") \ + T(InvalidUnicodeEscape, "Invalid Unicode escape") \ + T(NothingToRepeat, "Nothing to repeat") \ + T(LoneQuantifierBrackets, "Lone quantifier brackets") \ + T(RangeOutOfOrder, "numbers out of order in {} quantifier") \ + T(IncompleteQuantifier, "Incomplete quantifier") \ + T(InvalidQuantifier, "Invalid quantifier") \ + T(InvalidGroup, "Invalid group") \ + T(MultipleFlagDashes, "Multiple dashes in flag group") \ + T(NotLinear, "Cannot be executed in linear time") \ + T(RepeatedFlag, "Repeated flag in flag group") \ + T(InvalidFlagGroup, "Invalid flag group") \ + T(TooManyCaptures, "Too many captures") \ + T(InvalidCaptureGroupName, "Invalid capture group name") \ + T(DuplicateCaptureGroupName, "Duplicate capture group name") \ + T(InvalidNamedReference, "Invalid named reference") \ + T(InvalidNamedCaptureReference, "Invalid named capture referenced") \ + T(InvalidClassEscape, "Invalid class escape") \ + T(InvalidClassPropertyName, "Invalid property name in character class") \ + T(InvalidCharacterClass, "Invalid character class") \ + T(UnterminatedCharacterClass, "Unterminated character class") \ + T(OutOfOrderCharacterClass, "Range out of order in character class") \ + T(InvalidClassSetOperation, "Invalid set operation in character class") \ + T(InvalidCharacterInClass, "Invalid character in character class") \ + 
T(NegatedCharacterClassWithStrings, \ + "Negated character class may contain strings") + +enum class RegExpError : uint32_t { +#define TEMPLATE(NAME, STRING) k##NAME, + REGEXP_ERROR_MESSAGES(TEMPLATE) +#undef TEMPLATE + NumErrors +}; + +V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error); + +inline constexpr bool RegExpErrorIsStackOverflow(RegExpError error) { + return error == RegExpError::kStackOverflow || + error == RegExpError::kAnalysisStackOverflow; +} + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_ERROR_H_ diff --git a/js/src/irregexp/imported/regexp-interpreter.cc b/js/src/irregexp/imported/regexp-interpreter.cc new file mode 100644 index 0000000000..859fa53c0b --- /dev/null +++ b/js/src/irregexp/imported/regexp-interpreter.cc @@ -0,0 +1,1147 @@ +// Copyright 2011 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// A simple interpreter for the Irregexp byte code. + +#include "irregexp/imported/regexp-interpreter.h" + +#include "irregexp/imported/regexp-bytecodes.h" +#include "irregexp/imported/regexp-macro-assembler.h" +#include "irregexp/imported/regexp-stack.h" // For kMaximumStackSize. +#include "irregexp/imported/regexp.h" + +#ifdef V8_INTL_SUPPORT +#include "unicode/uchar.h" +#endif // V8_INTL_SUPPORT + +// Use token threaded dispatch iff the compiler supports computed gotos and the +// build argument v8_enable_regexp_interpreter_threaded_dispatch was set. 
+#if V8_HAS_COMPUTED_GOTO && \ + defined(V8_ENABLE_REGEXP_INTERPRETER_THREADED_DISPATCH) +#define V8_USE_COMPUTED_GOTO 1 +#endif // V8_HAS_COMPUTED_GOTO + +namespace v8 { +namespace internal { + +namespace { + +bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, + base::Vector<const base::uc16> subject, + bool unicode) { + Address offset_a = + reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(from))); + Address offset_b = + reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(current))); + size_t length = len * base::kUC16Size; + + bool result = unicode + ? RegExpMacroAssembler::CaseInsensitiveCompareUnicode( + offset_a, offset_b, length, isolate) + : RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode( + offset_a, offset_b, length, isolate); + return result == 1; +} + +bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, + base::Vector<const uint8_t> subject, bool unicode) { + // For Latin1 characters the unicode flag makes no difference. + for (int i = 0; i < len; i++) { + unsigned int old_char = subject[from++]; + unsigned int new_char = subject[current++]; + if (old_char == new_char) continue; + // Convert both characters to lower case. + old_char |= 0x20; + new_char |= 0x20; + if (old_char != new_char) return false; + // Not letters in the ASCII range and Latin-1 range. + if (!(old_char - 'a' <= 'z' - 'a') && + !(old_char - 224 <= 254 - 224 && old_char != 247)) { + return false; + } + } + return true; +} + +#ifdef DEBUG +void MaybeTraceInterpreter(const byte* code_base, const byte* pc, + int stack_depth, int current_position, + uint32_t current_char, int bytecode_length, + const char* bytecode_name) { + if (v8_flags.trace_regexp_bytecodes) { + const bool printable = std::isprint(current_char); + const char* format = + printable + ? 
"pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = " + : "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = "; + PrintF(format, pc - code_base, stack_depth, current_position, current_char, + printable ? current_char : '.'); + + RegExpBytecodeDisassembleSingle(code_base, pc); + } +} +#endif // DEBUG + +int32_t Load32Aligned(const byte* pc) { + DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 3); + return *reinterpret_cast<const int32_t*>(pc); +} + +// TODO(jgruber): Rename to Load16AlignedUnsigned. +uint32_t Load16Aligned(const byte* pc) { + DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1); + return *reinterpret_cast<const uint16_t*>(pc); +} + +int32_t Load16AlignedSigned(const byte* pc) { + DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1); + return *reinterpret_cast<const int16_t*>(pc); +} + +// Helpers to access the packed argument. Takes the 32 bits containing the +// current bytecode, where the 8 LSB contain the bytecode and the rest contains +// a packed 24-bit argument. +// TODO(jgruber): Specify signed-ness in bytecode signature declarations, and +// police restrictions during bytecode generation. +int32_t LoadPacked24Signed(int32_t bytecode_and_packed_arg) { + return bytecode_and_packed_arg >> BYTECODE_SHIFT; +} +uint32_t LoadPacked24Unsigned(int32_t bytecode_and_packed_arg) { + return static_cast<uint32_t>(bytecode_and_packed_arg) >> BYTECODE_SHIFT; +} + +// A simple abstraction over the backtracking stack used by the interpreter. +// +// Despite the name 'backtracking' stack, it's actually used as a generic stack +// that stores both program counters (= offsets into the bytecode) and generic +// integer values. 
+class BacktrackStack { + public: + BacktrackStack() = default; + BacktrackStack(const BacktrackStack&) = delete; + BacktrackStack& operator=(const BacktrackStack&) = delete; + + V8_WARN_UNUSED_RESULT bool push(int v) { + data_.emplace_back(v); + return (static_cast<int>(data_.size()) <= kMaxSize); + } + int peek() const { + DCHECK(!data_.empty()); + return data_.back(); + } + int pop() { + int v = peek(); + data_.pop_back(); + return v; + } + + // The 'sp' is the index of the first empty element in the stack. + int sp() const { return static_cast<int>(data_.size()); } + void set_sp(int new_sp) { + DCHECK_LE(new_sp, sp()); + data_.resize_no_init(new_sp); + } + + private: + // Semi-arbitrary. Should be large enough for common cases to remain in the + // static stack-allocated backing store, but small enough not to waste space. + static constexpr int kStaticCapacity = 64; + + using ValueT = int; + base::SmallVector<ValueT, kStaticCapacity> data_; + + static constexpr int kMaxSize = + RegExpStack::kMaximumStackSize / sizeof(ValueT); +}; + +// Registers used during interpreter execution. These consist of output +// registers in indices [0, output_register_count[ which will contain matcher +// results as a {start,end} index tuple for each capture (where the whole match +// counts as implicit capture 0); and internal registers in indices +// [output_register_count, total_register_count[. +class InterpreterRegisters { + public: + using RegisterT = int; + + InterpreterRegisters(int total_register_count, RegisterT* output_registers, + int output_register_count) + : registers_(total_register_count), + output_registers_(output_registers), + output_register_count_(output_register_count) { + // TODO(jgruber): Use int32_t consistently for registers. Currently, CSA + // uses int32_t while runtime uses int. + static_assert(sizeof(int) == sizeof(int32_t)); + DCHECK_GE(output_register_count, 2); // At least 2 for the match itself. 
+ DCHECK_GE(total_register_count, output_register_count); + DCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount); + DCHECK_NOT_NULL(output_registers); + + // Initialize the output register region to -1 signifying 'no match'. + std::memset(registers_.data(), -1, + output_register_count * sizeof(RegisterT)); + } + + const RegisterT& operator[](size_t index) const { return registers_[index]; } + RegisterT& operator[](size_t index) { return registers_[index]; } + + void CopyToOutputRegisters() { + MemCopy(output_registers_, registers_.data(), + output_register_count_ * sizeof(RegisterT)); + } + + private: + static constexpr int kStaticCapacity = 64; // Arbitrary. + base::SmallVector<RegisterT, kStaticCapacity> registers_; + RegisterT* const output_registers_; + const int output_register_count_; +}; + +IrregexpInterpreter::Result ThrowStackOverflow(Isolate* isolate, + RegExp::CallOrigin call_origin) { + CHECK(call_origin == RegExp::CallOrigin::kFromRuntime); + // We abort interpreter execution after the stack overflow is thrown, and thus + // allow allocation here despite the outer DisallowGarbageCollectionScope. + AllowGarbageCollection yes_gc; + isolate->StackOverflow(); + return IrregexpInterpreter::EXCEPTION; +} + +// Only throws if called from the runtime, otherwise just returns the EXCEPTION +// status code. 
+IrregexpInterpreter::Result MaybeThrowStackOverflow( + Isolate* isolate, RegExp::CallOrigin call_origin) { + if (call_origin == RegExp::CallOrigin::kFromRuntime) { + return ThrowStackOverflow(isolate, call_origin); + } else { + return IrregexpInterpreter::EXCEPTION; + } +} + +template <typename Char> +void UpdateCodeAndSubjectReferences( + Isolate* isolate, Handle<ByteArray> code_array, + Handle<String> subject_string, ByteArray* code_array_out, + const byte** code_base_out, const byte** pc_out, String* subject_string_out, + base::Vector<const Char>* subject_string_vector_out) { + DisallowGarbageCollection no_gc; + + if (*code_base_out != code_array->GetDataStartAddress()) { + *code_array_out = *code_array; + const intptr_t pc_offset = *pc_out - *code_base_out; + DCHECK_GT(pc_offset, 0); + *code_base_out = code_array->GetDataStartAddress(); + *pc_out = *code_base_out + pc_offset; + } + + DCHECK(subject_string->IsFlat()); + *subject_string_out = *subject_string; + *subject_string_vector_out = subject_string->GetCharVector<Char>(no_gc); +} + +// Runs all pending interrupts and updates unhandlified object references if +// necessary. +template <typename Char> +IrregexpInterpreter::Result HandleInterrupts( + Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out, + String* subject_string_out, const byte** code_base_out, + base::Vector<const Char>* subject_string_vector_out, const byte** pc_out) { + DisallowGarbageCollection no_gc; + + StackLimitCheck check(isolate); + bool js_has_overflowed = check.JsHasOverflowed(); + + if (call_origin == RegExp::CallOrigin::kFromJs) { + // Direct calls from JavaScript can be interrupted in two ways: + // 1. A real stack overflow, in which case we let the caller throw the + // exception. + // 2. The stack guard was used to interrupt execution for another purpose, + // forcing the call through the runtime system. 
+ if (js_has_overflowed) { + return IrregexpInterpreter::EXCEPTION; + } else if (check.InterruptRequested()) { + return IrregexpInterpreter::RETRY; + } + } else { + DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime); + // Prepare for possible GC. + HandleScope handles(isolate); + Handle<ByteArray> code_handle(*code_array_out, isolate); + Handle<String> subject_handle(*subject_string_out, isolate); + + if (js_has_overflowed) { + return ThrowStackOverflow(isolate, call_origin); + } else if (check.InterruptRequested()) { + const bool was_one_byte = + String::IsOneByteRepresentationUnderneath(*subject_string_out); + Object result; + { + AllowGarbageCollection yes_gc; + result = isolate->stack_guard()->HandleInterrupts(); + } + if (result.IsException(isolate)) { + return IrregexpInterpreter::EXCEPTION; + } + + // If we changed between a LATIN1 and a UC16 string, we need to + // restart regexp matching with the appropriate template instantiation of + // RawMatch. + if (String::IsOneByteRepresentationUnderneath(*subject_handle) != + was_one_byte) { + return IrregexpInterpreter::RETRY; + } + + UpdateCodeAndSubjectReferences( + isolate, code_handle, subject_handle, code_array_out, code_base_out, + pc_out, subject_string_out, subject_string_vector_out); + } + } + + return IrregexpInterpreter::SUCCESS; +} + +bool CheckBitInTable(const uint32_t current_char, const byte* const table) { + int mask = RegExpMacroAssembler::kTableMask; + int b = table[(current_char & mask) >> kBitsPerByteLog2]; + int bit = (current_char & (kBitsPerByte - 1)); + return (b & (1 << bit)) != 0; +} + +// Returns true iff 0 <= index < length. +bool IndexIsInBounds(int index, int length) { + DCHECK_GE(length, 0); + return static_cast<uintptr_t>(index) < static_cast<uintptr_t>(length); +} + +// If computed gotos are supported by the compiler, we can get addresses to +// labels directly in C/C++. 
Every bytecode handler has its own label and we +// store the addresses in a dispatch table indexed by bytecode. To execute the +// next handler we simply jump (goto) directly to its address. +#if V8_USE_COMPUTED_GOTO +#define BC_LABEL(name) BC_##name: +#define DECODE() \ + do { \ + next_insn = Load32Aligned(next_pc); \ + next_handler_addr = dispatch_table[next_insn & BYTECODE_MASK]; \ + } while (false) +#define DISPATCH() \ + pc = next_pc; \ + insn = next_insn; \ + goto* next_handler_addr +// Without computed goto support, we fall back to a simple switch-based +// dispatch (A large switch statement inside a loop with a case for every +// bytecode). +#else // V8_USE_COMPUTED_GOTO +#define BC_LABEL(name) case BC_##name: +#define DECODE() next_insn = Load32Aligned(next_pc) +#define DISPATCH() \ + pc = next_pc; \ + insn = next_insn; \ + goto switch_dispatch_continuation +#endif // V8_USE_COMPUTED_GOTO + +// ADVANCE/SET_PC_FROM_OFFSET are separated from DISPATCH, because ideally some +// instructions can be executed between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH. +// We want those two macros as far apart as possible, because the goto in +// DISPATCH is dependent on a memory load in ADVANCE/SET_PC_FROM_OFFSET. If we +// don't hit the cache and have to fetch the next handler address from physical +// memory, instructions between ADVANCE/SET_PC_FROM_OFFSET and DISPATCH can +// potentially be executed unconditionally, reducing memory stall. +#define ADVANCE(name) \ + next_pc = pc + RegExpBytecodeLength(BC_##name); \ + DECODE() +#define SET_PC_FROM_OFFSET(offset) \ + next_pc = code_base + offset; \ + DECODE() + +// Current position mutations. 
+#define SET_CURRENT_POSITION(value) \ + do { \ + current = (value); \ + DCHECK(base::IsInRange(current, 0, subject.length())); \ + } while (false) +#define ADVANCE_CURRENT_POSITION(by) SET_CURRENT_POSITION(current + (by)) + +#ifdef DEBUG +#define BYTECODE(name) \ + BC_LABEL(name) \ + MaybeTraceInterpreter(code_base, pc, backtrack_stack.sp(), current, \ + current_char, RegExpBytecodeLength(BC_##name), #name); +#else +#define BYTECODE(name) BC_LABEL(name) +#endif // DEBUG + +template <typename Char> +IrregexpInterpreter::Result RawMatch( + Isolate* isolate, ByteArray code_array, String subject_string, + base::Vector<const Char> subject, int* output_registers, + int output_register_count, int total_register_count, int current, + uint32_t current_char, RegExp::CallOrigin call_origin, + const uint32_t backtrack_limit) { + DisallowGarbageCollection no_gc; + +#if V8_USE_COMPUTED_GOTO + +// We have to make sure that no OOB access to the dispatch table is possible and +// all values are valid label addresses. +// Otherwise jumps to arbitrary addresses could potentially happen. +// This is ensured as follows: +// Every index to the dispatch table gets masked using BYTECODE_MASK in +// DECODE(). This way we can only get values between 0 (only the least +// significant byte of an integer is used) and kRegExpPaddedBytecodeCount - 1 +// (BYTECODE_MASK is defined to be exactly this value). +// All entries from kRegExpBytecodeCount to kRegExpPaddedBytecodeCount have to +// be filled with BREAKs (invalid operation). + +// Fill dispatch table from last defined bytecode up to the next power of two +// with BREAK (invalid operation). +// TODO(pthier): Find a way to fill up automatically (at compile time) +// 59 real bytecodes -> 5 fillers +#define BYTECODE_FILLER_ITERATOR(V) \ + V(BREAK) /* 1 */ \ + V(BREAK) /* 2 */ \ + V(BREAK) /* 3 */ \ + V(BREAK) /* 4 */ \ + V(BREAK) /* 5 */ + +#define COUNT(...) 
+1 + static constexpr int kRegExpBytecodeFillerCount = + BYTECODE_FILLER_ITERATOR(COUNT); +#undef COUNT + + // Make sure kRegExpPaddedBytecodeCount is actually the closest possible power + // of two. + DCHECK_EQ(kRegExpPaddedBytecodeCount, + base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount)); + + // Make sure every bytecode we get by using BYTECODE_MASK is well defined. + static_assert(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount); + static_assert(kRegExpBytecodeCount + kRegExpBytecodeFillerCount == + kRegExpPaddedBytecodeCount); + +#define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name, + static const void* const dispatch_table[kRegExpPaddedBytecodeCount] = { + BYTECODE_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY) + BYTECODE_FILLER_ITERATOR(DECLARE_DISPATCH_TABLE_ENTRY)}; +#undef DECLARE_DISPATCH_TABLE_ENTRY +#undef BYTECODE_FILLER_ITERATOR + +#endif // V8_USE_COMPUTED_GOTO + + const byte* pc = code_array.GetDataStartAddress(); + const byte* code_base = pc; + + InterpreterRegisters registers(total_register_count, output_registers, + output_register_count); + BacktrackStack backtrack_stack; + + uint32_t backtrack_count = 0; + +#ifdef DEBUG + if (v8_flags.trace_regexp_bytecodes) { + PrintF("\n\nStart bytecode interpreter\n\n"); + } +#endif + + while (true) { + const byte* next_pc = pc; + int32_t insn; + int32_t next_insn; +#if V8_USE_COMPUTED_GOTO + const void* next_handler_addr; + DECODE(); + DISPATCH(); +#else + insn = Load32Aligned(pc); + switch (insn & BYTECODE_MASK) { +#endif // V8_USE_COMPUTED_GOTO + BYTECODE(BREAK) { UNREACHABLE(); } + BYTECODE(PUSH_CP) { + ADVANCE(PUSH_CP); + if (!backtrack_stack.push(current)) { + return MaybeThrowStackOverflow(isolate, call_origin); + } + DISPATCH(); + } + BYTECODE(PUSH_BT) { + ADVANCE(PUSH_BT); + if (!backtrack_stack.push(Load32Aligned(pc + 4))) { + return MaybeThrowStackOverflow(isolate, call_origin); + } + DISPATCH(); + } + BYTECODE(PUSH_REGISTER) { + ADVANCE(PUSH_REGISTER); + if 
(!backtrack_stack.push(registers[LoadPacked24Unsigned(insn)])) { + return MaybeThrowStackOverflow(isolate, call_origin); + } + DISPATCH(); + } + BYTECODE(SET_REGISTER) { + ADVANCE(SET_REGISTER); + registers[LoadPacked24Unsigned(insn)] = Load32Aligned(pc + 4); + DISPATCH(); + } + BYTECODE(ADVANCE_REGISTER) { + ADVANCE(ADVANCE_REGISTER); + registers[LoadPacked24Unsigned(insn)] += Load32Aligned(pc + 4); + DISPATCH(); + } + BYTECODE(SET_REGISTER_TO_CP) { + ADVANCE(SET_REGISTER_TO_CP); + registers[LoadPacked24Unsigned(insn)] = current + Load32Aligned(pc + 4); + DISPATCH(); + } + BYTECODE(SET_CP_TO_REGISTER) { + ADVANCE(SET_CP_TO_REGISTER); + SET_CURRENT_POSITION(registers[LoadPacked24Unsigned(insn)]); + DISPATCH(); + } + BYTECODE(SET_REGISTER_TO_SP) { + ADVANCE(SET_REGISTER_TO_SP); + registers[LoadPacked24Unsigned(insn)] = backtrack_stack.sp(); + DISPATCH(); + } + BYTECODE(SET_SP_TO_REGISTER) { + ADVANCE(SET_SP_TO_REGISTER); + backtrack_stack.set_sp(registers[LoadPacked24Unsigned(insn)]); + DISPATCH(); + } + BYTECODE(POP_CP) { + ADVANCE(POP_CP); + SET_CURRENT_POSITION(backtrack_stack.pop()); + DISPATCH(); + } + BYTECODE(POP_BT) { + static_assert(JSRegExp::kNoBacktrackLimit == 0); + if (++backtrack_count == backtrack_limit) { + int return_code = LoadPacked24Signed(insn); + return static_cast<IrregexpInterpreter::Result>(return_code); + } + + IrregexpInterpreter::Result return_code = + HandleInterrupts(isolate, call_origin, &code_array, &subject_string, + &code_base, &subject, &pc); + if (return_code != IrregexpInterpreter::SUCCESS) return return_code; + + SET_PC_FROM_OFFSET(backtrack_stack.pop()); + DISPATCH(); + } + BYTECODE(POP_REGISTER) { + ADVANCE(POP_REGISTER); + registers[LoadPacked24Unsigned(insn)] = backtrack_stack.pop(); + DISPATCH(); + } + BYTECODE(FAIL) { + isolate->counters()->regexp_backtracks()->AddSample( + static_cast<int>(backtrack_count)); + return IrregexpInterpreter::FAILURE; + } + BYTECODE(SUCCEED) { + 
isolate->counters()->regexp_backtracks()->AddSample( + static_cast<int>(backtrack_count)); + registers.CopyToOutputRegisters(); + return IrregexpInterpreter::SUCCESS; + } + BYTECODE(ADVANCE_CP) { + ADVANCE(ADVANCE_CP); + ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn)); + DISPATCH(); + } + BYTECODE(GOTO) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + BYTECODE(ADVANCE_CP_AND_GOTO) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn)); + DISPATCH(); + } + BYTECODE(CHECK_GREEDY) { + if (current == backtrack_stack.peek()) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + backtrack_stack.pop(); + } else { + ADVANCE(CHECK_GREEDY); + } + DISPATCH(); + } + BYTECODE(LOAD_CURRENT_CHAR) { + int pos = current + LoadPacked24Signed(insn); + if (pos >= subject.length() || pos < 0) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(LOAD_CURRENT_CHAR); + current_char = subject[pos]; + } + DISPATCH(); + } + BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { + ADVANCE(LOAD_CURRENT_CHAR_UNCHECKED); + int pos = current + LoadPacked24Signed(insn); + current_char = subject[pos]; + DISPATCH(); + } + BYTECODE(LOAD_2_CURRENT_CHARS) { + int pos = current + LoadPacked24Signed(insn); + if (pos + 2 > subject.length() || pos < 0) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(LOAD_2_CURRENT_CHARS); + Char next = subject[pos + 1]; + current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); + } + DISPATCH(); + } + BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { + ADVANCE(LOAD_2_CURRENT_CHARS_UNCHECKED); + int pos = current + LoadPacked24Signed(insn); + Char next = subject[pos + 1]; + current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); + DISPATCH(); + } + BYTECODE(LOAD_4_CURRENT_CHARS) { + DCHECK_EQ(1, sizeof(Char)); + int pos = current + LoadPacked24Signed(insn); + if (pos + 4 > subject.length() || pos < 0) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + 
ADVANCE(LOAD_4_CURRENT_CHARS); + Char next1 = subject[pos + 1]; + Char next2 = subject[pos + 2]; + Char next3 = subject[pos + 3]; + current_char = + (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24)); + } + DISPATCH(); + } + BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) { + ADVANCE(LOAD_4_CURRENT_CHARS_UNCHECKED); + DCHECK_EQ(1, sizeof(Char)); + int pos = current + LoadPacked24Signed(insn); + Char next1 = subject[pos + 1]; + Char next2 = subject[pos + 2]; + Char next3 = subject[pos + 3]; + current_char = + (subject[pos] | (next1 << 8) | (next2 << 16) | (next3 << 24)); + DISPATCH(); + } + BYTECODE(CHECK_4_CHARS) { + uint32_t c = Load32Aligned(pc + 4); + if (c == current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(CHECK_4_CHARS); + } + DISPATCH(); + } + BYTECODE(CHECK_CHAR) { + uint32_t c = LoadPacked24Unsigned(insn); + if (c == current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_CHAR); + } + DISPATCH(); + } + BYTECODE(CHECK_NOT_4_CHARS) { + uint32_t c = Load32Aligned(pc + 4); + if (c != current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(CHECK_NOT_4_CHARS); + } + DISPATCH(); + } + BYTECODE(CHECK_NOT_CHAR) { + uint32_t c = LoadPacked24Unsigned(insn); + if (c != current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_NOT_CHAR); + } + DISPATCH(); + } + BYTECODE(AND_CHECK_4_CHARS) { + uint32_t c = Load32Aligned(pc + 4); + if (c == (current_char & Load32Aligned(pc + 8))) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); + } else { + ADVANCE(AND_CHECK_4_CHARS); + } + DISPATCH(); + } + BYTECODE(AND_CHECK_CHAR) { + uint32_t c = LoadPacked24Unsigned(insn); + if (c == (current_char & Load32Aligned(pc + 4))) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(AND_CHECK_CHAR); + } + DISPATCH(); + } + BYTECODE(AND_CHECK_NOT_4_CHARS) { + uint32_t c = Load32Aligned(pc + 4); + if (c != (current_char & Load32Aligned(pc + 8))) { + 
SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); + } else { + ADVANCE(AND_CHECK_NOT_4_CHARS); + } + DISPATCH(); + } + BYTECODE(AND_CHECK_NOT_CHAR) { + uint32_t c = LoadPacked24Unsigned(insn); + if (c != (current_char & Load32Aligned(pc + 4))) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(AND_CHECK_NOT_CHAR); + } + DISPATCH(); + } + BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { + uint32_t c = LoadPacked24Unsigned(insn); + uint32_t minus = Load16Aligned(pc + 4); + uint32_t mask = Load16Aligned(pc + 6); + if (c != ((current_char - minus) & mask)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(MINUS_AND_CHECK_NOT_CHAR); + } + DISPATCH(); + } + BYTECODE(CHECK_CHAR_IN_RANGE) { + uint32_t from = Load16Aligned(pc + 4); + uint32_t to = Load16Aligned(pc + 6); + if (from <= current_char && current_char <= to) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(CHECK_CHAR_IN_RANGE); + } + DISPATCH(); + } + BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { + uint32_t from = Load16Aligned(pc + 4); + uint32_t to = Load16Aligned(pc + 6); + if (from > current_char || current_char > to) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(CHECK_CHAR_NOT_IN_RANGE); + } + DISPATCH(); + } + BYTECODE(CHECK_BIT_IN_TABLE) { + if (CheckBitInTable(current_char, pc + 8)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_BIT_IN_TABLE); + } + DISPATCH(); + } + BYTECODE(CHECK_LT) { + uint32_t limit = LoadPacked24Unsigned(insn); + if (current_char < limit) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_LT); + } + DISPATCH(); + } + BYTECODE(CHECK_GT) { + uint32_t limit = LoadPacked24Unsigned(insn); + if (current_char > limit) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_GT); + } + DISPATCH(); + } + BYTECODE(CHECK_REGISTER_LT) { + if (registers[LoadPacked24Unsigned(insn)] < Load32Aligned(pc + 4)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + 
ADVANCE(CHECK_REGISTER_LT); + } + DISPATCH(); + } + BYTECODE(CHECK_REGISTER_GE) { + if (registers[LoadPacked24Unsigned(insn)] >= Load32Aligned(pc + 4)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } else { + ADVANCE(CHECK_REGISTER_GE); + } + DISPATCH(); + } + BYTECODE(CHECK_REGISTER_EQ_POS) { + if (registers[LoadPacked24Unsigned(insn)] == current) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_REGISTER_EQ_POS); + } + DISPATCH(); + } + BYTECODE(CHECK_NOT_REGS_EQUAL) { + if (registers[LoadPacked24Unsigned(insn)] == + registers[Load32Aligned(pc + 4)]) { + ADVANCE(CHECK_NOT_REGS_EQUAL); + } else { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + } + DISPATCH(); + } + BYTECODE(CHECK_NOT_BACK_REF) { + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current + len > subject.length() || + !CompareCharsEqual(&subject[from], &subject[current], len)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(len); + } + ADVANCE(CHECK_NOT_BACK_REF); + DISPATCH(); + } + BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) { + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current - len < 0 || + !CompareCharsEqual(&subject[from], &subject[current - len], len)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + SET_CURRENT_POSITION(current - len); + } + ADVANCE(CHECK_NOT_BACK_REF_BACKWARD); + DISPATCH(); + } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current + len > subject.length() || + !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(len); + } + 
ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE); + DISPATCH(); + } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current + len > subject.length() || + !BackRefMatchesNoCase(isolate, from, current, len, subject, + false)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(len); + } + ADVANCE(CHECK_NOT_BACK_REF_NO_CASE); + DISPATCH(); + } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) { + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current - len < 0 || + !BackRefMatchesNoCase(isolate, from, current - len, len, subject, + true)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + SET_CURRENT_POSITION(current - len); + } + ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD); + DISPATCH(); + } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current - len < 0 || + !BackRefMatchesNoCase(isolate, from, current - len, len, subject, + false)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + SET_CURRENT_POSITION(current - len); + } + ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD); + DISPATCH(); + } + BYTECODE(CHECK_AT_START) { + if (current + LoadPacked24Signed(insn) == 0) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_AT_START); + } + DISPATCH(); + } + BYTECODE(CHECK_NOT_AT_START) { + if (current + LoadPacked24Signed(insn) == 0) { + ADVANCE(CHECK_NOT_AT_START); + } else { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } + DISPATCH(); + } + BYTECODE(SET_CURRENT_POSITION_FROM_END) { + ADVANCE(SET_CURRENT_POSITION_FROM_END); + int by = LoadPacked24Unsigned(insn); + if 
(subject.length() - current > by) { + SET_CURRENT_POSITION(subject.length() - by); + current_char = subject[current - 1]; + } + DISPATCH(); + } + BYTECODE(CHECK_CURRENT_POSITION) { + int pos = current + LoadPacked24Signed(insn); + if (pos > subject.length() || pos < 0) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + } else { + ADVANCE(CHECK_CURRENT_POSITION); + } + DISPATCH(); + } + BYTECODE(SKIP_UNTIL_CHAR) { + int32_t load_offset = LoadPacked24Signed(insn); + int32_t advance = Load16AlignedSigned(pc + 4); + uint32_t c = Load16Aligned(pc + 6); + while (IndexIsInBounds(current + load_offset, subject.length())) { + current_char = subject[current + load_offset]; + if (c == current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(advance); + } + SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); + DISPATCH(); + } + BYTECODE(SKIP_UNTIL_CHAR_AND) { + int32_t load_offset = LoadPacked24Signed(insn); + int32_t advance = Load16AlignedSigned(pc + 4); + uint16_t c = Load16Aligned(pc + 6); + uint32_t mask = Load32Aligned(pc + 8); + int32_t maximum_offset = Load32Aligned(pc + 12); + while (static_cast<uintptr_t>(current + maximum_offset) <= + static_cast<uintptr_t>(subject.length())) { + current_char = subject[current + load_offset]; + if (c == (current_char & mask)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(advance); + } + SET_PC_FROM_OFFSET(Load32Aligned(pc + 20)); + DISPATCH(); + } + BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) { + int32_t load_offset = LoadPacked24Signed(insn); + int32_t advance = Load16AlignedSigned(pc + 4); + uint16_t c = Load16Aligned(pc + 6); + int32_t maximum_offset = Load32Aligned(pc + 8); + while (static_cast<uintptr_t>(current + maximum_offset) <= + static_cast<uintptr_t>(subject.length())) { + current_char = subject[current + load_offset]; + if (c == current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); + DISPATCH(); + } + 
ADVANCE_CURRENT_POSITION(advance); + } + SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); + DISPATCH(); + } + BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) { + int32_t load_offset = LoadPacked24Signed(insn); + int32_t advance = Load16AlignedSigned(pc + 4); + const byte* table = pc + 8; + while (IndexIsInBounds(current + load_offset, subject.length())) { + current_char = subject[current + load_offset]; + if (CheckBitInTable(current_char, table)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(advance); + } + SET_PC_FROM_OFFSET(Load32Aligned(pc + 28)); + DISPATCH(); + } + BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) { + int32_t load_offset = LoadPacked24Signed(insn); + int32_t advance = Load16AlignedSigned(pc + 4); + uint16_t limit = Load16Aligned(pc + 6); + const byte* table = pc + 8; + while (IndexIsInBounds(current + load_offset, subject.length())) { + current_char = subject[current + load_offset]; + if (current_char > limit) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); + DISPATCH(); + } + if (!CheckBitInTable(current_char, table)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(advance); + } + SET_PC_FROM_OFFSET(Load32Aligned(pc + 28)); + DISPATCH(); + } + BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) { + int32_t load_offset = LoadPacked24Signed(insn); + int32_t advance = Load32Aligned(pc + 4); + uint16_t c = Load16Aligned(pc + 8); + uint16_t c2 = Load16Aligned(pc + 10); + while (IndexIsInBounds(current + load_offset, subject.length())) { + current_char = subject[current + load_offset]; + // The two if-statements below are split up intentionally, as combining + // them seems to result in register allocation behaving quite + // differently and slowing down the resulting code. 
+ if (c == current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); + DISPATCH(); + } + if (c2 == current_char) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(advance); + } + SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); + DISPATCH(); + } +#if V8_USE_COMPUTED_GOTO +// Lint gets confused a lot if we just use !V8_USE_COMPUTED_GOTO or ifndef +// V8_USE_COMPUTED_GOTO here. +#else + default: + UNREACHABLE(); + } + // Label we jump to in DISPATCH(). There must be no instructions between the + // end of the switch, this label and the end of the loop. + switch_dispatch_continuation : {} +#endif // V8_USE_COMPUTED_GOTO + } +} + +#undef BYTECODE +#undef ADVANCE_CURRENT_POSITION +#undef SET_CURRENT_POSITION +#undef DISPATCH +#undef DECODE +#undef SET_PC_FROM_OFFSET +#undef ADVANCE +#undef BC_LABEL +#undef V8_USE_COMPUTED_GOTO + +} // namespace + +// static +IrregexpInterpreter::Result IrregexpInterpreter::Match( + Isolate* isolate, JSRegExp regexp, String subject_string, + int* output_registers, int output_register_count, int start_position, + RegExp::CallOrigin call_origin) { + if (v8_flags.regexp_tier_up) regexp.TierUpTick(); + + bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string); + ByteArray code_array = ByteArray::cast(regexp.bytecode(is_one_byte)); + int total_register_count = regexp.max_register_count(); + + return MatchInternal(isolate, code_array, subject_string, output_registers, + output_register_count, total_register_count, + start_position, call_origin, regexp.backtrack_limit()); +} + +IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( + Isolate* isolate, ByteArray code_array, String subject_string, + int* output_registers, int output_register_count, int total_register_count, + int start_position, RegExp::CallOrigin call_origin, + uint32_t backtrack_limit) { + DCHECK(subject_string.IsFlat()); + + // TODO(chromium:1262676): Remove this CHECK once fixed. 
+ CHECK(code_array.IsByteArray()); + + // Note: Heap allocation *is* allowed in two situations if calling from + // Runtime: + // 1. When creating & throwing a stack overflow exception. The interpreter + // aborts afterwards, and thus possible-moved objects are never used. + // 2. When handling interrupts. We manually relocate unhandlified references + // after interrupts have run. + DisallowGarbageCollection no_gc; + + base::uc16 previous_char = '\n'; + String::FlatContent subject_content = subject_string.GetFlatContent(no_gc); + // Because interrupts can result in GC and string content relocation, the + // checksum verification in FlatContent may fail even though this code is + // safe. See (2) above. + subject_content.UnsafeDisableChecksumVerification(); + if (subject_content.IsOneByte()) { + base::Vector<const uint8_t> subject_vector = + subject_content.ToOneByteVector(); + if (start_position != 0) previous_char = subject_vector[start_position - 1]; + return RawMatch(isolate, code_array, subject_string, subject_vector, + output_registers, output_register_count, + total_register_count, start_position, previous_char, + call_origin, backtrack_limit); + } else { + DCHECK(subject_content.IsTwoByte()); + base::Vector<const base::uc16> subject_vector = + subject_content.ToUC16Vector(); + if (start_position != 0) previous_char = subject_vector[start_position - 1]; + return RawMatch(isolate, code_array, subject_string, subject_vector, + output_registers, output_register_count, + total_register_count, start_position, previous_char, + call_origin, backtrack_limit); + } +} + +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + +// This method is called through an external reference from RegExpExecInternal +// builtin. 
+//
+// |subject| and |regexp| are raw addresses that are reinterpreted below as a
+// String and a JSRegExp. The two unnamed Address parameters are ignored.
+// Returns RETRY without matching when the regexp is marked for tier-up;
+// otherwise returns whatever Match() returns.
+IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
+    Address subject, int32_t start_position, Address, Address,
+    int* output_registers, int32_t output_register_count,
+    RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp) {
+  DCHECK_NOT_NULL(isolate);
+  DCHECK_NOT_NULL(output_registers);
+  DCHECK(call_origin == RegExp::CallOrigin::kFromJs);
+
+  // NOTE(review): judging by their names, these scope objects forbid GC, JS
+  // execution and handle allocation/dereference for the rest of this call --
+  // confirm against their definitions.
+  DisallowGarbageCollection no_gc;
+  DisallowJavascriptExecution no_js(isolate);
+  DisallowHandleAllocation no_handles;
+  DisallowHandleDereference no_deref;
+
+  String subject_string = String::cast(Object(subject));
+  JSRegExp regexp_obj = JSRegExp::cast(Object(regexp));
+
+  if (regexp_obj.MarkedForTierUp()) {
+    // Returning RETRY will re-enter through runtime, where actual recompilation
+    // for tier-up takes place.
+    return IrregexpInterpreter::RETRY;
+  }
+
+  return Match(isolate, regexp_obj, subject_string, output_registers,
+               output_register_count, start_position, call_origin);
+}
+
+#endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
+// Runtime entry point: dereferences the |regexp| and |subject_string| handles
+// and calls Match() with RegExp::CallOrigin::kFromRuntime.
+IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime(
+    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
+    int* output_registers, int output_register_count, int start_position) {
+  return Match(isolate, *regexp, *subject_string, output_registers,
+               output_register_count, start_position,
+               RegExp::CallOrigin::kFromRuntime);
+}
+
+}  // namespace internal
+}  // namespace v8
diff --git a/js/src/irregexp/imported/regexp-interpreter.h b/js/src/irregexp/imported/regexp-interpreter.h
new file mode 100644
index 0000000000..bc55be2b8c
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-interpreter.h
@@ -0,0 +1,68 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// A simple interpreter for the Irregexp byte code.
+
+#ifndef V8_REGEXP_REGEXP_INTERPRETER_H_
+#define V8_REGEXP_REGEXP_INTERPRETER_H_
+
+#include "irregexp/imported/regexp.h"
+
+namespace v8 {
+namespace internal {
+
+class ByteArray;
+
+class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
+ public:
+  // Result codes of a match attempt. Each enumerator aliases the
+  // RegExp::kInternalRegExp* constant of the same meaning.
+  enum Result {
+    FAILURE = RegExp::kInternalRegExpFailure,
+    SUCCESS = RegExp::kInternalRegExpSuccess,
+    EXCEPTION = RegExp::kInternalRegExpException,
+    RETRY = RegExp::kInternalRegExpRetry,
+    FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental,
+  };
+
+  // In case a StackOverflow occurs, a StackOverflowException is created and
+  // EXCEPTION is returned.
+  static Result MatchForCallFromRuntime(
+      Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
+      int* output_registers, int output_register_count, int start_position);
+
+  // In case a StackOverflow occurs, EXCEPTION is returned. The caller is
+  // responsible for creating the exception.
+  //
+  // RETRY is returned if a retry through the runtime is needed (e.g. when
+  // interrupts have been scheduled or the regexp is marked for tier-up).
+  //
+  // Arguments input_start and input_end are unused. They are only passed to
+  // match the signature of the native irregexp code.
+  //
+  // Arguments output_registers and output_register_count describe the results
+  // array, which will contain register values of all captures if SUCCESS is
+  // returned. For all other return codes, the results array remains unmodified.
+  static Result MatchForCallFromJs(Address subject, int32_t start_position,
+                                   Address input_start, Address input_end,
+                                   int* output_registers,
+                                   int32_t output_register_count,
+                                   RegExp::CallOrigin call_origin,
+                                   Isolate* isolate, Address regexp);
+
+  // Runs the bytecode in |code_array| against |subject_string|, starting at
+  // |start_position|. The definition DCHECKs that |subject_string| is flat and
+  // picks the one-byte or two-byte interpreter from the string's flat content.
+  static Result MatchInternal(Isolate* isolate, ByteArray code_array,
+                              String subject_string, int* output_registers,
+                              int output_register_count,
+                              int total_register_count, int start_position,
+                              RegExp::CallOrigin call_origin,
+                              uint32_t backtrack_limit);
+
+ private:
+  // Shared implementation behind the two MatchForCall* entry points: fetches
+  // the bytecode, register count and backtrack limit from |regexp| and
+  // forwards to MatchInternal.
+  static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,
+                      int* output_registers, int output_register_count,
+                      int start_position, RegExp::CallOrigin call_origin);
+};
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_REGEXP_REGEXP_INTERPRETER_H_
diff --git a/js/src/irregexp/imported/regexp-macro-assembler-arch.h b/js/src/irregexp/imported/regexp-macro-assembler-arch.h
new file mode 100644
index 0000000000..a755e7c1b3
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler-arch.h
@@ -0,0 +1,7 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "irregexp/RegExpNativeMacroAssembler.h"
diff --git a/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc b/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc
new file mode 100644
index 0000000000..6444ca3c60
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc
@@ -0,0 +1,438 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "irregexp/imported/regexp-macro-assembler-tracer.h"
+
+
+namespace v8 {
+namespace internal {
+
+// RegExpMacroAssemblerTracer wraps another RegExpMacroAssembler. Every method
+// below prints a one-line description of the call with PrintF and forwards
+// the call, with the same arguments, to the wrapped assembler_.
+RegExpMacroAssemblerTracer::RegExpMacroAssemblerTracer(
+    Isolate* isolate, RegExpMacroAssembler* assembler)
+    : RegExpMacroAssembler(isolate, assembler->zone()), assembler_(assembler) {
+  PrintF("RegExpMacroAssembler%s();\n",
+         ImplementationToString(assembler->Implementation()));
+}
+
+RegExpMacroAssemblerTracer::~RegExpMacroAssemblerTracer() = default;
+
+void RegExpMacroAssemblerTracer::AbortedCodeGeneration() {
+  PrintF(" AbortedCodeGeneration\n");
+  assembler_->AbortedCodeGeneration();
+}
+
+
+// This is used for printing out debugging information. It makes an integer
+// that is closely related to the address of an object.
+static int LabelToInt(Label* label) {
+  return static_cast<int>(reinterpret_cast<intptr_t>(label));
+}
+
+
+void RegExpMacroAssemblerTracer::Bind(Label* label) {
+  PrintF("label[%08x]: (Bind)\n", LabelToInt(label));
+  assembler_->Bind(label);
+}
+
+
+void RegExpMacroAssemblerTracer::AdvanceCurrentPosition(int by) {
+  PrintF(" AdvanceCurrentPosition(by=%d);\n", by);
+  assembler_->AdvanceCurrentPosition(by);
+}
+
+
+void RegExpMacroAssemblerTracer::CheckGreedyLoop(Label* label) {
+  PrintF(" CheckGreedyLoop(label[%08x]);\n\n", LabelToInt(label));
+  assembler_->CheckGreedyLoop(label);
+}
+
+
+void RegExpMacroAssemblerTracer::PopCurrentPosition() {
+  PrintF(" PopCurrentPosition();\n");
+  assembler_->PopCurrentPosition();
+}
+
+
+void RegExpMacroAssemblerTracer::PushCurrentPosition() {
+  PrintF(" PushCurrentPosition();\n");
+  assembler_->PushCurrentPosition();
+}
+
+
+void RegExpMacroAssemblerTracer::Backtrack() {
+  PrintF(" Backtrack();\n");
+  assembler_->Backtrack();
+}
+
+
+void RegExpMacroAssemblerTracer::GoTo(Label* label) {
+  PrintF(" GoTo(label[%08x]);\n\n", LabelToInt(label));
+  assembler_->GoTo(label);
+}
+
+
+void RegExpMacroAssemblerTracer::PushBacktrack(Label* label) {
+  PrintF(" PushBacktrack(label[%08x]);\n", LabelToInt(label));
+  assembler_->PushBacktrack(label);
+}
+
+
+// Forwards first so that the wrapped assembler's answer (whether matching
+// restarts for a global match) can be included in the trace line.
+bool RegExpMacroAssemblerTracer::Succeed() {
+  bool restart = assembler_->Succeed();
+  PrintF(" Succeed();%s\n", restart ? " [restart for global match]" : "");
+  return restart;
+}
+
+
+void RegExpMacroAssemblerTracer::Fail() {
+  // Terminate the line like every other trace message; without the newline
+  // the next trace entry is appended to this one.
+  PrintF(" Fail();\n");
+  assembler_->Fail();
+}
+
+
+void RegExpMacroAssemblerTracer::PopRegister(int register_index) {
+  PrintF(" PopRegister(register=%d);\n", register_index);
+  assembler_->PopRegister(register_index);
+}
+
+
+void RegExpMacroAssemblerTracer::PushRegister(
+    int register_index,
+    StackCheckFlag check_stack_limit) {
+  PrintF(" PushRegister(register=%d, %s);\n",
+         register_index,
+         check_stack_limit ? "check stack limit" : "");
+  assembler_->PushRegister(register_index, check_stack_limit);
+}
+
+
+void RegExpMacroAssemblerTracer::AdvanceRegister(int reg, int by) {
+  PrintF(" AdvanceRegister(register=%d, by=%d);\n", reg, by);
+  assembler_->AdvanceRegister(reg, by);
+}
+
+
+void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) {
+  PrintF(" SetCurrentPositionFromEnd(by=%d);\n", by);
+  assembler_->SetCurrentPositionFromEnd(by);
+}
+
+
+void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) {
+  PrintF(" SetRegister(register=%d, to=%d);\n", register_index, to);
+  assembler_->SetRegister(register_index, to);
+}
+
+
+void RegExpMacroAssemblerTracer::WriteCurrentPositionToRegister(int reg,
+                                                                 int cp_offset) {
+  PrintF(" WriteCurrentPositionToRegister(register=%d,cp_offset=%d);\n",
+         reg,
+         cp_offset);
+  assembler_->WriteCurrentPositionToRegister(reg, cp_offset);
+}
+
+
+void RegExpMacroAssemblerTracer::ClearRegisters(int reg_from, int reg_to) {
+  // Log under the method's real name, as every other trace message does.
+  PrintF(" ClearRegisters(from=%d, to=%d);\n", reg_from, reg_to);
+  assembler_->ClearRegisters(reg_from, reg_to);
+}
+
+
+void RegExpMacroAssemblerTracer::ReadCurrentPositionFromRegister(int reg) {
+  PrintF(" ReadCurrentPositionFromRegister(register=%d);\n", reg);
+  assembler_->ReadCurrentPositionFromRegister(reg);
+}
+
+
+void RegExpMacroAssemblerTracer::WriteStackPointerToRegister(int reg) {
+  PrintF(" WriteStackPointerToRegister(register=%d);\n", reg);
+  assembler_->WriteStackPointerToRegister(reg);
+}
+
+
+void RegExpMacroAssemblerTracer::ReadStackPointerFromRegister(int reg) {
+  PrintF(" ReadStackPointerFromRegister(register=%d);\n", reg);
+  assembler_->ReadStackPointerFromRegister(reg);
+}
+
+// Traces the load (marking it " (unchecked)" when |check_bounds| is false) and
+// forwards to the wrapped assembler's LoadCurrentCharacter.
+void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl(
+    int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
+    int eats_at_least) {
+  const char* check_msg = check_bounds ? "" : " (unchecked)";
+  PrintF(
+      " LoadCurrentCharacter(cp_offset=%d, label[%08x]%s (%d chars) (eats at "
+      "least %d));\n",
+      cp_offset, LabelToInt(on_end_of_input), check_msg, characters,
+      eats_at_least);
+  assembler_->LoadCurrentCharacter(cp_offset, on_end_of_input, check_bounds,
+                                   characters, eats_at_least);
+}
+
+namespace {
+
+// Formats a character for trace output. operator* yields "(c)" when the
+// character lies in the range ' '..'~' and an empty string otherwise. The
+// returned pointer refers to the object's own buffer, so it is only usable
+// while the PrintablePrinter is alive.
+class PrintablePrinter {
+ public:
+  explicit PrintablePrinter(base::uc16 character) : character_(character) {}
+
+  const char* operator*() {
+    if (character_ >= ' ' && character_ <= '~') {
+      buffer_[0] = '(';
+      buffer_[1] = static_cast<char>(character_);
+      buffer_[2] = ')';
+      buffer_[3] = '\0';
+    } else {
+      buffer_[0] = '\0';
+    }
+    return &buffer_[0];
+  }
+
+ private:
+  base::uc16 character_;
+  char buffer_[4];
+};
+
+}  // namespace
+
+void RegExpMacroAssemblerTracer::CheckCharacterLT(base::uc16 limit,
+                                                  Label* on_less) {
+  PrintablePrinter printable(limit);
+  PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n",
+         limit,
+         *printable,
+         LabelToInt(on_less));
+  assembler_->CheckCharacterLT(limit, on_less);
+}
+
+void RegExpMacroAssemblerTracer::CheckCharacterGT(base::uc16 limit,
+                                                  Label* on_greater) {
+  PrintablePrinter printable(limit);
+  PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n",
+         limit,
+         *printable,
+         LabelToInt(on_greater));
+  assembler_->CheckCharacterGT(limit, on_greater);
+}
+
+void
RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) { + PrintablePrinter printable(c); + PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n", + c, + *printable, + LabelToInt(on_equal)); + assembler_->CheckCharacter(c, on_equal); +} + +void RegExpMacroAssemblerTracer::CheckAtStart(int cp_offset, + Label* on_at_start) { + PrintF(" CheckAtStart(cp_offset=%d, label[%08x]);\n", cp_offset, + LabelToInt(on_at_start)); + assembler_->CheckAtStart(cp_offset, on_at_start); +} + +void RegExpMacroAssemblerTracer::CheckNotAtStart(int cp_offset, + Label* on_not_at_start) { + PrintF(" CheckNotAtStart(cp_offset=%d, label[%08x]);\n", cp_offset, + LabelToInt(on_not_at_start)); + assembler_->CheckNotAtStart(cp_offset, on_not_at_start); +} + + +void RegExpMacroAssemblerTracer::CheckNotCharacter(unsigned c, + Label* on_not_equal) { + PrintablePrinter printable(c); + PrintF(" CheckNotCharacter(c=0x%04x%s, label[%08x]);\n", + c, + *printable, + LabelToInt(on_not_equal)); + assembler_->CheckNotCharacter(c, on_not_equal); +} + + +void RegExpMacroAssemblerTracer::CheckCharacterAfterAnd( + unsigned c, + unsigned mask, + Label* on_equal) { + PrintablePrinter printable(c); + PrintF(" CheckCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n", + c, + *printable, + mask, + LabelToInt(on_equal)); + assembler_->CheckCharacterAfterAnd(c, mask, on_equal); +} + + +void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd( + unsigned c, + unsigned mask, + Label* on_not_equal) { + PrintablePrinter printable(c); + PrintF(" CheckNotCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n", + c, + *printable, + mask, + LabelToInt(on_not_equal)); + assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal); +} + +void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd( + base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) { + PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=%04x, mask=0x%04x, " + "label[%08x]);\n", + c, + minus, + mask, + 
LabelToInt(on_not_equal)); + assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal); +} + +void RegExpMacroAssemblerTracer::CheckCharacterInRange(base::uc16 from, + base::uc16 to, + Label* on_not_in_range) { + PrintablePrinter printable_from(from); + PrintablePrinter printable_to(to); + PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n", + from, + *printable_from, + to, + *printable_to, + LabelToInt(on_not_in_range)); + assembler_->CheckCharacterInRange(from, to, on_not_in_range); +} + +void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, + Label* on_in_range) { + PrintablePrinter printable_from(from); + PrintablePrinter printable_to(to); + PrintF( + " CheckCharacterNotInRange(from=0x%04x%s," " to=%04x%s, label[%08x]);\n", + from, + *printable_from, + to, + *printable_to, + LabelToInt(on_in_range)); + assembler_->CheckCharacterNotInRange(from, to, on_in_range); +} + +namespace { + +void PrintRangeArray(const ZoneList<CharacterRange>* ranges) { + for (int i = 0; i < ranges->length(); i++) { + base::uc16 from = ranges->at(i).from(); + base::uc16 to = ranges->at(i).to(); + PrintablePrinter printable_from(from); + PrintablePrinter printable_to(to); + PrintF(" [from=0x%04x%s, to=%04x%s],\n", from, *printable_from, to, + *printable_to); + } +} + +} // namespace + +bool RegExpMacroAssemblerTracer::CheckCharacterInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_in_range) { + PrintF( + " CheckCharacterInRangeArray(\n" + " label[%08x]);\n", + LabelToInt(on_in_range)); + PrintRangeArray(ranges); + return assembler_->CheckCharacterInRangeArray(ranges, on_in_range); +} + +bool RegExpMacroAssemblerTracer::CheckCharacterNotInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) { + PrintF( + " CheckCharacterNotInRangeArray(\n" + " label[%08x]);\n", + LabelToInt(on_not_in_range)); + PrintRangeArray(ranges); + return 
assembler_->CheckCharacterNotInRangeArray(ranges, on_not_in_range); +} + +void RegExpMacroAssemblerTracer::CheckBitInTable( + Handle<ByteArray> table, Label* on_bit_set) { + PrintF(" CheckBitInTable(label[%08x] ", LabelToInt(on_bit_set)); + for (int i = 0; i < kTableSize; i++) { + PrintF("%c", table->get(i) != 0 ? 'X' : '.'); + if (i % 32 == 31 && i != kTableMask) { + PrintF("\n "); + } + } + PrintF(");\n"); + assembler_->CheckBitInTable(table, on_bit_set); +} + + +void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg, + bool read_backward, + Label* on_no_match) { + PrintF(" CheckNotBackReference(register=%d, %s, label[%08x]);\n", start_reg, + read_backward ? "backward" : "forward", LabelToInt(on_no_match)); + assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match); +} + +void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase( + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n", + start_reg, read_backward ? "backward" : "forward", + unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match)); + assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode, + on_no_match); +} + +void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset, + Label* on_outside_input) { + PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset, + LabelToInt(on_outside_input)); + assembler_->CheckPosition(cp_offset, on_outside_input); +} + +bool RegExpMacroAssemblerTracer::CheckSpecialClassRanges( + StandardCharacterSet type, Label* on_no_match) { + bool supported = assembler_->CheckSpecialClassRanges(type, on_no_match); + PrintF(" CheckSpecialClassRanges(type='%c', label[%08x]): %s;\n", + static_cast<char>(type), LabelToInt(on_no_match), + supported ? 
"true" : "false"); + return supported; +} + +void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index, + int comparand, Label* if_lt) { + PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n", + register_index, comparand, LabelToInt(if_lt)); + assembler_->IfRegisterLT(register_index, comparand, if_lt); +} + + +void RegExpMacroAssemblerTracer::IfRegisterEqPos(int register_index, + Label* if_eq) { + PrintF(" IfRegisterEqPos(register=%d, label[%08x]);\n", + register_index, LabelToInt(if_eq)); + assembler_->IfRegisterEqPos(register_index, if_eq); +} + + +void RegExpMacroAssemblerTracer::IfRegisterGE(int register_index, + int comparand, Label* if_ge) { + PrintF(" IfRegisterGE(register=%d, number=%d, label[%08x]);\n", + register_index, comparand, LabelToInt(if_ge)); + assembler_->IfRegisterGE(register_index, comparand, if_ge); +} + + +RegExpMacroAssembler::IrregexpImplementation + RegExpMacroAssemblerTracer::Implementation() { + return assembler_->Implementation(); +} + + +Handle<HeapObject> RegExpMacroAssemblerTracer::GetCode(Handle<String> source) { + PrintF(" GetCode(%s);\n", source->ToCString().get()); + return assembler_->GetCode(source); +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-macro-assembler-tracer.h b/js/src/irregexp/imported/regexp-macro-assembler-tracer.h new file mode 100644 index 0000000000..3fadf1a893 --- /dev/null +++ b/js/src/irregexp/imported/regexp-macro-assembler-tracer.h @@ -0,0 +1,90 @@ +// Copyright 2008 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_ +#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_ + +#include "irregexp/imported/regexp-macro-assembler.h" + +namespace v8 { +namespace internal { + +// Decorator on a RegExpMacroAssembler that write all calls. 
+class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { + public: + RegExpMacroAssemblerTracer(Isolate* isolate, RegExpMacroAssembler* assembler); + ~RegExpMacroAssemblerTracer() override; + void AbortedCodeGeneration() override; + int stack_limit_slack() override { return assembler_->stack_limit_slack(); } + bool CanReadUnaligned() const override { + return assembler_->CanReadUnaligned(); + } + void AdvanceCurrentPosition(int by) override; // Signed cp change. + void AdvanceRegister(int reg, int by) override; // r[reg] += by. + void Backtrack() override; + void Bind(Label* label) override; + void CheckCharacter(unsigned c, Label* on_equal) override; + void CheckCharacterAfterAnd(unsigned c, unsigned and_with, + Label* on_equal) override; + void CheckCharacterGT(base::uc16 limit, Label* on_greater) override; + void CheckCharacterLT(base::uc16 limit, Label* on_less) override; + void CheckGreedyLoop(Label* on_tos_equals_current_position) override; + void CheckAtStart(int cp_offset, Label* on_at_start) override; + void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override; + void CheckNotBackReference(int start_reg, bool read_backward, + Label* on_no_match) override; + void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + bool unicode, + Label* on_no_match) override; + void CheckNotCharacter(unsigned c, Label* on_not_equal) override; + void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, + Label* on_not_equal) override; + void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, + base::uc16 and_with, + Label* on_not_equal) override; + void CheckCharacterInRange(base::uc16 from, base::uc16 to, + Label* on_in_range) override; + void CheckCharacterNotInRange(base::uc16 from, base::uc16 to, + Label* on_not_in_range) override; + bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges, + Label* on_in_range) override; + bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges, + Label* 
on_not_in_range) override; + void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override; + void CheckPosition(int cp_offset, Label* on_outside_input) override; + bool CheckSpecialClassRanges(StandardCharacterSet type, + Label* on_no_match) override; + void Fail() override; + Handle<HeapObject> GetCode(Handle<String> source) override; + void GoTo(Label* label) override; + void IfRegisterGE(int reg, int comparand, Label* if_ge) override; + void IfRegisterLT(int reg, int comparand, Label* if_lt) override; + void IfRegisterEqPos(int reg, Label* if_eq) override; + IrregexpImplementation Implementation() override; + void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, + bool check_bounds, int characters, + int eats_at_least) override; + void PopCurrentPosition() override; + void PopRegister(int register_index) override; + void PushBacktrack(Label* label) override; + void PushCurrentPosition() override; + void PushRegister(int register_index, + StackCheckFlag check_stack_limit) override; + void ReadCurrentPositionFromRegister(int reg) override; + void ReadStackPointerFromRegister(int reg) override; + void SetCurrentPositionFromEnd(int by) override; + void SetRegister(int register_index, int to) override; + bool Succeed() override; + void WriteCurrentPositionToRegister(int reg, int cp_offset) override; + void ClearRegisters(int reg_from, int reg_to) override; + void WriteStackPointerToRegister(int reg) override; + + private: + RegExpMacroAssembler* assembler_; +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_ diff --git a/js/src/irregexp/imported/regexp-macro-assembler.cc b/js/src/irregexp/imported/regexp-macro-assembler.cc new file mode 100644 index 0000000000..0592338229 --- /dev/null +++ b/js/src/irregexp/imported/regexp-macro-assembler.cc @@ -0,0 +1,520 @@ +// Copyright 2012 the V8 project authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/imported/regexp-macro-assembler.h" + +#include "irregexp/imported/regexp-stack.h" +#include "irregexp/imported/special-case.h" + +#ifdef V8_INTL_SUPPORT +#include "unicode/uchar.h" +#include "unicode/unistr.h" +#endif // V8_INTL_SUPPORT + +namespace v8 { +namespace internal { + +RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) + : slow_safe_compiler_(false), + backtrack_limit_(JSRegExp::kNoBacktrackLimit), + global_mode_(NOT_GLOBAL), + isolate_(isolate), + zone_(zone) {} + +bool RegExpMacroAssembler::has_backtrack_limit() const { + return backtrack_limit_ != JSRegExp::kNoBacktrackLimit; +} + +// static +int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate) { +#ifdef V8_INTL_SUPPORT + // This function is not allowed to cause a garbage collection. + // A GC might move the calling generated code and invalidate the + // return address on the stack. + DisallowGarbageCollection no_gc; + DCHECK_EQ(0, byte_length % 2); + size_t length = byte_length / 2; + base::uc16* substring1 = reinterpret_cast<base::uc16*>(byte_offset1); + base::uc16* substring2 = reinterpret_cast<base::uc16*>(byte_offset2); + + for (size_t i = 0; i < length; i++) { + UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]); + UChar32 c2 = RegExpCaseFolding::Canonicalize(substring2[i]); + if (c1 != c2) { + return 0; + } + } + return 1; +#else + return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length, + isolate); +#endif +} + +// static +int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate) { + // This function is not allowed to cause a garbage collection. 
+ // A GC might move the calling generated code and invalidate the + // return address on the stack. + DisallowGarbageCollection no_gc; + DCHECK_EQ(0, byte_length % 2); + +#ifdef V8_INTL_SUPPORT + int32_t length = static_cast<int32_t>(byte_length >> 1); + icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1), + length); + return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2), + length, U_FOLD_CASE_DEFAULT) == 0; +#else + base::uc16* substring1 = reinterpret_cast<base::uc16*>(byte_offset1); + base::uc16* substring2 = reinterpret_cast<base::uc16*>(byte_offset2); + size_t length = byte_length >> 1; + DCHECK_NOT_NULL(isolate); + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = + isolate->regexp_macro_assembler_canonicalize(); + for (size_t i = 0; i < length; i++) { + unibrow::uchar c1 = substring1[i]; + unibrow::uchar c2 = substring2[i]; + if (c1 != c2) { + unibrow::uchar s1[1] = {c1}; + canonicalize->get(c1, '\0', s1); + if (s1[0] != c2) { + unibrow::uchar s2[1] = {c2}; + canonicalize->get(c2, '\0', s2); + if (s1[0] != s2[0]) { + return 0; + } + } + } + } + return 1; +#endif // V8_INTL_SUPPORT +} + +namespace { + +uint32_t Hash(const ZoneList<CharacterRange>* ranges) { + size_t seed = 0; + for (int i = 0; i < ranges->length(); i++) { + const CharacterRange& r = ranges->at(i); + seed = base::hash_combine(seed, r.from(), r.to()); + } + return static_cast<uint32_t>(seed); +} + +constexpr base::uc32 MaskEndOfRangeMarker(base::uc32 c) { + // CharacterRanges may use 0x10ffff as the end-of-range marker irrespective + // of whether the regexp IsUnicode or not; translate the marker value here. + DCHECK_IMPLIES(c > kMaxUInt16, c == String::kMaxCodePoint); + return c & 0xffff; +} + +int RangeArrayLengthFor(const ZoneList<CharacterRange>* ranges) { + const int ranges_length = ranges->length(); + return MaskEndOfRangeMarker(ranges->at(ranges_length - 1).to()) == kMaxUInt16 + ? 
ranges_length * 2 - 1 + : ranges_length * 2; +} + +bool Equals(const ZoneList<CharacterRange>* lhs, + const Handle<FixedUInt16Array>& rhs) { + const int rhs_length = rhs->length(); + if (rhs_length != RangeArrayLengthFor(lhs)) return false; + for (int i = 0; i < lhs->length(); i++) { + const CharacterRange& r = lhs->at(i); + if (rhs->get(i * 2 + 0) != r.from()) return false; + if (i * 2 + 1 == rhs_length) break; + if (rhs->get(i * 2 + 1) != r.to() + 1) return false; + } + return true; +} + +Handle<FixedUInt16Array> MakeRangeArray( + Isolate* isolate, const ZoneList<CharacterRange>* ranges) { + const int ranges_length = ranges->length(); + const int range_array_length = RangeArrayLengthFor(ranges); + Handle<FixedUInt16Array> range_array = + FixedUInt16Array::New(isolate, range_array_length); + for (int i = 0; i < ranges_length; i++) { + const CharacterRange& r = ranges->at(i); + DCHECK_LE(r.from(), kMaxUInt16); + range_array->set(i * 2 + 0, r.from()); + const base::uc32 to = MaskEndOfRangeMarker(r.to()); + if (i == ranges_length - 1 && to == kMaxUInt16) { + DCHECK_EQ(range_array_length, ranges_length * 2 - 1); + break; // Avoid overflow by leaving the last range open-ended. + } + DCHECK_LT(to, kMaxUInt16); + range_array->set(i * 2 + 1, to + 1); // Exclusive. 
+ } + return range_array; +} + +} // namespace + +Handle<ByteArray> NativeRegExpMacroAssembler::GetOrAddRangeArray( + const ZoneList<CharacterRange>* ranges) { + const uint32_t hash = Hash(ranges); + + if (range_array_cache_.count(hash) != 0) { + Handle<FixedUInt16Array> range_array = range_array_cache_[hash]; + if (Equals(ranges, range_array)) return range_array; + } + + Handle<FixedUInt16Array> range_array = MakeRangeArray(isolate(), ranges); + range_array_cache_[hash] = range_array; + return range_array; +} + +// static +uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char, + Address raw_byte_array, + Isolate* isolate) { + // Use uint32_t to avoid complexity around bool return types (which may be + // optimized to use only the least significant byte). + static constexpr uint32_t kTrue = 1; + static constexpr uint32_t kFalse = 0; + + FixedUInt16Array ranges = FixedUInt16Array::cast(Object(raw_byte_array)); + DCHECK_GE(ranges.length(), 1); + + // Shortcut for fully out of range chars. + if (current_char < ranges.get(0)) return kFalse; + if (current_char >= ranges.get(ranges.length() - 1)) { + // The last range may be open-ended. + return (ranges.length() % 2) == 0 ? kFalse : kTrue; + } + + // Binary search for the matching range. `ranges` is encoded as + // [from0, to0, from1, to1, ..., fromN, toN], or + // [from0, to0, from1, to1, ..., fromN] (open-ended last interval). + + int mid, lower = 0, upper = ranges.length(); + do { + mid = lower + (upper - lower) / 2; + const base::uc16 elem = ranges.get(mid); + if (current_char < elem) { + upper = mid; + } else if (current_char > elem) { + lower = mid + 1; + } else { + DCHECK_EQ(current_char, elem); + break; + } + } while (lower < upper); + + const bool current_char_ge_last_elem = current_char >= ranges.get(mid); + const int current_range_start_index = + current_char_ge_last_elem ? mid : mid - 1; + + // Ranges start at even indices and end at odd indices. 
+ return (current_range_start_index % 2) == 0 ? kTrue : kFalse; +} + +void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, + Label* on_failure) { + Label ok; + // Check that current character is not a trail surrogate. + LoadCurrentCharacter(cp_offset, &ok); + CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok); + // Check that previous character is not a lead surrogate. + LoadCurrentCharacter(cp_offset - 1, &ok); + CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure); + Bind(&ok); +} + +void RegExpMacroAssembler::CheckPosition(int cp_offset, + Label* on_outside_input) { + LoadCurrentCharacter(cp_offset, on_outside_input, true); +} + +void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset, + Label* on_end_of_input, + bool check_bounds, + int characters, + int eats_at_least) { + // By default, eats_at_least = characters. + if (eats_at_least == kUseCharactersValue) { + eats_at_least = characters; + } + + LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters, + eats_at_least); +} + +void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl( + int cp_offset, Label* on_end_of_input, bool check_bounds, int characters, + int eats_at_least) { + // It's possible to preload a small number of characters when each success + // path requires a large number of characters, but not the reverse. + DCHECK_GE(eats_at_least, characters); + + DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset)); + if (check_bounds) { + if (cp_offset >= 0) { + CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input); + } else { + CheckPosition(cp_offset, on_end_of_input); + } + } + LoadCurrentCharacterUnchecked(cp_offset, characters); +} + +bool NativeRegExpMacroAssembler::CanReadUnaligned() const { + return v8_flags.enable_regexp_unaligned_accesses && !slow_safe(); +} + +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + +// This method may only be called after an interrupt. 
// Decides how matching proceeds after the stack guard was hit.
//
// Returns EXCEPTION, RETRY, or 0 (continue matching normally).
//
// For calls that originate from JS nothing is handled here; the function
// only reports what is pending.  For calls that originate from the runtime
// a stack overflow is raised or pending interrupts are handled here.  Since
// that may trigger a GC, the function afterwards patches the return address
// at |return_address| if the code object moved, and refreshes |*subject|,
// |*input_start| and |*input_end| from the subject handle.
// static
int NativeRegExpMacroAssembler::CheckStackGuardState(
    Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
    Address* return_address, InstructionStream re_code, Address* subject,
    const byte** input_start, const byte** input_end) {
  DisallowGarbageCollection no_gc;
  // The return address must point into the regexp code we were called from.
  Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
  DCHECK_LE(re_code.instruction_start(), old_pc);
  DCHECK_LE(old_pc, re_code.code(kAcquireLoad).instruction_end());

  StackLimitCheck check(isolate);
  bool js_has_overflowed = check.JsHasOverflowed();

  if (call_origin == RegExp::CallOrigin::kFromJs) {
    // Direct calls from JavaScript can be interrupted in two ways:
    // 1. A real stack overflow, in which case we let the caller throw the
    //    exception.
    // 2. The stack guard was used to interrupt execution for another purpose,
    //    forcing the call through the runtime system.

    // Bug(v8:9540) Investigate why this method is called from JS although no
    // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
    // to continue execution normally.
    if (js_has_overflowed) {
      return EXCEPTION;
    } else if (check.InterruptRequested()) {
      return RETRY;
    } else {
      return 0;
    }
  }
  DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);

  // Prepare for possible GC.
  // Handles keep the code object and the subject string reachable and let us
  // observe their new locations afterwards.
  HandleScope handles(isolate);
  Handle<InstructionStream> code_handle(re_code, isolate);
  Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
  // Remember the representation so a change can be detected below.
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
  int return_value = 0;

  {
    DisableGCMole no_gc_mole;
    if (js_has_overflowed) {
      AllowGarbageCollection yes_gc;
      isolate->StackOverflow();
      return_value = EXCEPTION;
    } else if (check.InterruptRequested()) {
      AllowGarbageCollection yes_gc;
      Object result = isolate->stack_guard()->HandleInterrupts();
      if (result.IsException(isolate)) return_value = EXCEPTION;
    }

    // We are not using operator == here because it does a slow DCHECK
    // CheckObjectComparisonAllowed() which might crash when trying to access
    // the page header of the stale pointer.
    if (!code_handle->SafeEquals(re_code)) {  // Return address no longer valid
      // Overwrite the return address on the stack.
      // The new pc keeps the same offset into the relocated code object.
      intptr_t delta = code_handle->address() - re_code.address();
      Address new_pc = old_pc + delta;
      // TODO(v8:10026): avoid replacing a signed pointer.
      PointerAuthentication::ReplacePC(return_address, new_pc, 0);
    }
  }

  // If we continue, we need to update the subject string addresses.
  if (return_value == 0) {
    // String encoding might have changed.
    if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
        is_one_byte) {
      // If we changed between an LATIN1 and an UC16 string, the specialized
      // code cannot be used, and we need to restart regexp matching from
      // scratch (including, potentially, compiling a new version of the code).
      return_value = RETRY;
    } else {
      *subject = subject_handle->ptr();
      // Keep the same input length; only the base address is recomputed.
      intptr_t byte_length = *input_end - *input_start;
      *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
      *input_end = *input_start + byte_length;
    }
  }
  return return_value;
}

// Returns a {Result} sentinel, or the number of successful matches.
// Runs the compiled code of |regexp| against the flat string |subject|,
// starting at character index |previous_index|.  The subject is unwrapped
// (cons, sliced, thin) to find the characters, and the raw input range is
// handed to Execute together with the offsets vector.
int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
                                      Handle<String> subject,
                                      int* offsets_vector,
                                      int offsets_vector_length,
                                      int previous_index, Isolate* isolate) {
  DCHECK(subject->IsFlat());
  DCHECK_LE(0, previous_index);
  DCHECK_LE(previous_index, subject->length());

  // No allocations before calling the regexp, but we can't use
  // DisallowGarbageCollection, since regexps might be preempted, and another
  // thread might do allocation anyway.

  String subject_ptr = *subject;
  // Character offsets into string.
  int start_offset = previous_index;
  int char_length = subject_ptr.length() - start_offset;
  int slice_offset = 0;

  // The string has been flattened, so if it is a cons string it contains the
  // full string in the first part.
  if (StringShape(subject_ptr).IsCons()) {
    DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
    subject_ptr = ConsString::cast(subject_ptr).first();
  } else if (StringShape(subject_ptr).IsSliced()) {
    // Characters live in the parent string, shifted by the slice offset.
    SlicedString slice = SlicedString::cast(subject_ptr);
    subject_ptr = slice.parent();
    slice_offset = slice.offset();
  }
  if (StringShape(subject_ptr).IsThin()) {
    subject_ptr = ThinString::cast(subject_ptr).actual();
  }
  // Ensure that an underlying string has the same representation.
  bool is_one_byte = subject_ptr.IsOneByteRepresentation();
  DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
  // String is now either Sequential or External
  // One-byte characters occupy 1 byte, two-byte characters 2 bytes.
  int char_size_shift = is_one_byte ? 0 : 1;

  DisallowGarbageCollection no_gc;
  const byte* input_start =
      subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
  int byte_length = char_length << char_size_shift;
  const byte* input_end = input_start + byte_length;
  // Note that the original (still wrapped) subject is what gets passed on.
  return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
                 offsets_vector_length, isolate, *regexp);
}

// Forwards its arguments unchanged to Execute.
// static
int NativeRegExpMacroAssembler::ExecuteForTesting(
    String input, int start_offset, const byte* input_start,
    const byte* input_end, int* output, int output_size, Isolate* isolate,
    JSRegExp regexp) {
  return Execute(input, start_offset, input_start, input_end, output,
                 output_size, isolate, regexp);
}

// Returns a {Result} sentinel, or the number of successful matches.
// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
// the signature of the interpreter. We should get rid of JS objects passed to
// internal methods.
int NativeRegExpMacroAssembler::Execute(
    String input,  // This needs to be the unpacked (sliced, cons) string.
    int start_offset, const byte* input_start, const byte* input_end,
    int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
  RegExpStackScope stack_scope(isolate);

  // Pick the code object compiled for the subject's representation.
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
  Code code = Code::cast(regexp.code(is_one_byte));
  RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;

  // Signature used to call into the generated matcher code.
  using RegexpMatcherSig =
      // NOLINTNEXTLINE(readability/casting)
      int(Address input_string, int start_offset, const byte* input_start,
          const byte* input_end, int* output, int output_size, int call_origin,
          Isolate* isolate, Address regexp);

  auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(isolate, code);
  int result = fn.Call(input.ptr(), start_offset, input_start, input_end,
                       output, output_size, call_origin, isolate, regexp.ptr());
  DCHECK_GE(result, SMALLEST_REGEXP_RESULT);

  if (result == EXCEPTION && !isolate->has_pending_exception()) {
    // We detected a stack overflow (on the backtrack stack) in RegExp code,
    // but haven't created the exception yet. Additionally, we allow heap
    // allocation because even though it invalidates {input_start} and
    // {input_end}, we are about to return anyway.
    AllowGarbageCollection allow_allocation;
    isolate->StackOverflow();
  }
  return result;
}

#endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER

// Lookup table with one byte per Latin-1 code unit (256 entries): 0xFF for
// the characters '0'-'9', 'A'-'Z', '_' and 'a'-'z', 0x00 for everything else.
// clang-format off
const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
    0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
// clang-format on

// Asks the isolate's regexp stack for twice its current size.  Returns
// kNullAddress if EnsureCapacity reports failure, and the regexp stack's
// stack pointer otherwise.
// static
Address NativeRegExpMacroAssembler::GrowStack(Isolate* isolate) {
  DisallowGarbageCollection no_gc;

  RegExpStack* regexp_stack = isolate->regexp_stack();
  const size_t old_size = regexp_stack->memory_size();

#ifdef DEBUG
  // Sanity check: the part of the stack in use fits into the old size.
  const Address old_stack_top = regexp_stack->memory_top();
  const Address old_stack_pointer = regexp_stack->stack_pointer();
  CHECK_LE(old_stack_pointer, old_stack_top);
  CHECK_LE(static_cast<size_t>(old_stack_top - old_stack_pointer), old_size);
#endif  // DEBUG

  // The returned base is only used to detect failure.
  Address new_stack_base = regexp_stack->EnsureCapacity(old_size * 2);
  if (new_stack_base == kNullAddress) return kNullAddress;

  return regexp_stack->stack_pointer();
}

}  // namespace internal
}  // namespace v8
diff --git a/js/src/irregexp/imported/regexp-macro-assembler.h b/js/src/irregexp/imported/regexp-macro-assembler.h
new file mode 100644
index 0000000000..651f6cb580
--- /dev/null
+++ b/js/src/irregexp/imported/regexp-macro-assembler.h
@@ -0,0 +1,361 @@
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+ +#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ +#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ + +#include "irregexp/imported/regexp-ast.h" +#include "irregexp/imported/regexp.h" +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class ByteArray; +class JSRegExp; +class Label; +class String; + +static const base::uc32 kLeadSurrogateStart = 0xd800; +static const base::uc32 kLeadSurrogateEnd = 0xdbff; +static const base::uc32 kTrailSurrogateStart = 0xdc00; +static const base::uc32 kTrailSurrogateEnd = 0xdfff; +static const base::uc32 kNonBmpStart = 0x10000; +static const base::uc32 kNonBmpEnd = 0x10ffff; + +class RegExpMacroAssembler { + public: + // The implementation must be able to handle at least: + static constexpr int kMaxRegisterCount = (1 << 16); + static constexpr int kMaxRegister = kMaxRegisterCount - 1; + static constexpr int kMaxCaptures = (kMaxRegister - 1) / 2; + static constexpr int kMaxCPOffset = (1 << 15) - 1; + static constexpr int kMinCPOffset = -(1 << 15); + + static constexpr int kTableSizeBits = 7; + static constexpr int kTableSize = 1 << kTableSizeBits; + static constexpr int kTableMask = kTableSize - 1; + + static constexpr int kUseCharactersValue = -1; + + RegExpMacroAssembler(Isolate* isolate, Zone* zone); + virtual ~RegExpMacroAssembler() = default; + + virtual Handle<HeapObject> GetCode(Handle<String> source) = 0; + + // This function is called when code generation is aborted, so that + // the assembler can clean up internal data structures. + virtual void AbortedCodeGeneration() {} + // The maximal number of pushes between stack checks. Users must supply + // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck) + // at least once for every stack_limit_slack() pushes that are executed. + virtual int stack_limit_slack() = 0; + virtual bool CanReadUnaligned() const = 0; + + virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change. 
+ virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by. + // Continues execution from the position pushed on the top of the backtrack + // stack by an earlier PushBacktrack(Label*). + virtual void Backtrack() = 0; + virtual void Bind(Label* label) = 0; + // Counterpart of CheckNotCharacter below: compares the current character + // with c; the label name suggests on_equal is taken when they are equal. + virtual void CheckCharacter(unsigned c, Label* on_equal) = 0; + // Bitwise and the current character with the given constant and then + // check for a match with c. + virtual void CheckCharacterAfterAnd(unsigned c, + unsigned and_with, + Label* on_equal) = 0; + virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0; + virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0; + virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0; + virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0; + virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0; + virtual void CheckNotBackReference(int start_reg, bool read_backward, + Label* on_no_match) = 0; + virtual void CheckNotBackReferenceIgnoreCase(int start_reg, + bool read_backward, bool unicode, + Label* on_no_match) = 0; + // Check the current character for a match with a literal character. If we + // fail to match then goto the on_not_equal label. End of input always + // matches. If the label is nullptr then we should pop a backtrack address + // off the stack and go to that. + virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0; + virtual void CheckNotCharacterAfterAnd(unsigned c, + unsigned and_with, + Label* on_not_equal) = 0; + // Subtract a constant from the current character, then and with the given + // constant and then check for a match with c. 
+ virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, + base::uc16 and_with, + Label* on_not_equal) = 0; + virtual void CheckCharacterInRange(base::uc16 from, + base::uc16 to, // Both inclusive. + Label* on_in_range) = 0; + virtual void CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, // Both inclusive. + Label* on_not_in_range) = 0; + // Returns true if the check was emitted, false otherwise. + virtual bool CheckCharacterInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0; + virtual bool CheckCharacterNotInRangeArray( + const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0; + + // The current character (modulo kTableSize) is looked up in the byte + // array, and if the found byte is non-zero, we jump to the on_bit_set label. + virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0; + + // Checks whether the given offset from the current position is before + // the end of the string. May overwrite the current character. + virtual void CheckPosition(int cp_offset, Label* on_outside_input); + // Check whether a standard/default character class matches the current + // character. Returns false if the type of special character class does + // not have custom support. + // May clobber the currently loaded character. + virtual bool CheckSpecialClassRanges(StandardCharacterSet type, + Label* on_no_match) { + return false; + } + + // Control-flow integrity: + // Define a jump target and bind a label. + virtual void BindJumpTarget(Label* label) { Bind(label); } + + virtual void Fail() = 0; + virtual void GoTo(Label* label) = 0; + // Check whether a register is >= a given constant and go to a label if it + // is. Backtracks instead if the label is nullptr. + virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0; + // Check whether a register is < a given constant and go to a label if it is. + // Backtracks instead if the label is nullptr. 
+ virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0; + // Check whether a register is == to the current position and go to a + // label if it is. + virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0; + V8_EXPORT_PRIVATE void LoadCurrentCharacter( + int cp_offset, Label* on_end_of_input, bool check_bounds = true, + int characters = 1, int eats_at_least = kUseCharactersValue); + virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, + bool check_bounds, int characters, + int eats_at_least) = 0; + virtual void PopCurrentPosition() = 0; + virtual void PopRegister(int register_index) = 0; + // Pushes the label on the backtrack stack, so that a following Backtrack + // will go to this label. Always checks the backtrack stack limit. + virtual void PushBacktrack(Label* label) = 0; + virtual void PushCurrentPosition() = 0; + enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true }; + virtual void PushRegister(int register_index, + StackCheckFlag check_stack_limit) = 0; + virtual void ReadCurrentPositionFromRegister(int reg) = 0; + virtual void ReadStackPointerFromRegister(int reg) = 0; + virtual void SetCurrentPositionFromEnd(int by) = 0; + virtual void SetRegister(int register_index, int to) = 0; + // Return whether the matching (with a global regexp) will be restarted. + virtual bool Succeed() = 0; + virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; + virtual void ClearRegisters(int reg_from, int reg_to) = 0; + virtual void WriteStackPointerToRegister(int reg) = 0; + + // Check that we are not in the middle of a surrogate pair. 
+ void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); + +#define IMPLEMENTATIONS_LIST(V) \ + V(IA32) \ + V(ARM) \ + V(ARM64) \ + V(MIPS) \ + V(LOONG64) \ + V(RISCV) \ + V(RISCV32) \ + V(S390) \ + V(PPC) \ + V(X64) \ + V(Bytecode) + + enum IrregexpImplementation { +#define V(Name) k##Name##Implementation, + IMPLEMENTATIONS_LIST(V) +#undef V + }; + + inline const char* ImplementationToString(IrregexpImplementation impl) { + static const char* const kNames[] = { +#define V(Name) #Name, + IMPLEMENTATIONS_LIST(V) +#undef V + }; + return kNames[impl]; + } +#undef IMPLEMENTATIONS_LIST + virtual IrregexpImplementation Implementation() = 0; + + // Compare two-byte strings case insensitively. + // + // Called from generated code. + static int CaseInsensitiveCompareNonUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate); + static int CaseInsensitiveCompareUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate); + + // `raw_byte_array` is a ByteArray containing a set of character ranges, + // where ranges are encoded as uint16_t elements: + // + // [from0, to0, from1, to1, ..., fromN, toN], or + // [from0, to0, from1, to1, ..., fromN] (open-ended last interval). + // + // fromN is inclusive, toN is exclusive. Returns zero if not in a range, + // non-zero otherwise. + // + // Called from generated code. + static uint32_t IsCharacterInRangeArray(uint32_t current_char, + Address raw_byte_array, + Isolate* isolate); + + // Controls the generation of large inlined constants in the code. + void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } + bool slow_safe() const { return slow_safe_compiler_; } + + // Controls after how many backtracks irregexp should abort execution. If it + // can fall back to the experimental engine (see `set_can_fallback`), it will + // return the appropriate error code, otherwise it will return the number of + // matches found so far (perhaps none). 
+ void set_backtrack_limit(uint32_t backtrack_limit) { + backtrack_limit_ = backtrack_limit; + } + + // Set whether or not irregexp can fall back to the experimental engine on + // excessive backtracking. The number of backtracks considered excessive can + // be controlled with set_backtrack_limit. + void set_can_fallback(bool val) { can_fallback_ = val; } + + enum GlobalMode { + NOT_GLOBAL, + GLOBAL_NO_ZERO_LENGTH_CHECK, + GLOBAL, + GLOBAL_UNICODE + }; + // Set whether the regular expression has the global flag. Exiting due to + // a failure in a global regexp may still mean success overall. + inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } + inline bool global() const { return global_mode_ != NOT_GLOBAL; } + inline bool global_with_zero_length_check() const { + return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; + } + inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; } + + Isolate* isolate() const { return isolate_; } + Zone* zone() const { return zone_; } + + protected: + bool has_backtrack_limit() const; + uint32_t backtrack_limit() const { return backtrack_limit_; } + + bool can_fallback() const { return can_fallback_; } + + private: + bool slow_safe_compiler_; + uint32_t backtrack_limit_; + bool can_fallback_ = false; + GlobalMode global_mode_; + Isolate* const isolate_; + Zone* const zone_; +}; + +class NativeRegExpMacroAssembler: public RegExpMacroAssembler { + public: + // Type of input string to generate code for. + enum Mode { LATIN1 = 1, UC16 = 2 }; + + // Result of calling generated native RegExp code. + // RETRY: Something significant changed during execution, and the matching + // should be retried from scratch. + // EXCEPTION: Something failed during execution. If no exception has been + // thrown, it's an internal out-of-memory, and the caller should + // throw the exception. + // FAILURE: Matching failed. 
+ // SUCCESS: Matching succeeded, and the output array has been filled with + // capture positions. + // FALLBACK_TO_EXPERIMENTAL: Execute the regexp on this subject using the + // experimental engine instead. + enum Result { + FAILURE = RegExp::kInternalRegExpFailure, + SUCCESS = RegExp::kInternalRegExpSuccess, + EXCEPTION = RegExp::kInternalRegExpException, + RETRY = RegExp::kInternalRegExpRetry, + FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental, + SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult, + }; + + NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone) + : RegExpMacroAssembler(isolate, zone), range_array_cache_(zone) {} + ~NativeRegExpMacroAssembler() override = default; + + // Returns a {Result} sentinel, or the number of successful matches. + static int Match(Handle<JSRegExp> regexp, Handle<String> subject, + int* offsets_vector, int offsets_vector_length, + int previous_index, Isolate* isolate); + + V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset, + const byte* input_start, + const byte* input_end, + int* output, int output_size, + Isolate* isolate, + JSRegExp regexp); + + bool CanReadUnaligned() const override; + + void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, + bool check_bounds, int characters, + int eats_at_least) override; + // Load a number of characters at the given offset from the + // current position, into the current-character register. + virtual void LoadCurrentCharacterUnchecked(int cp_offset, + int character_count) = 0; + + // Called from RegExp if the backtrack stack limit is hit. Tries to expand + // the stack. Returns the new stack-pointer if successful, or returns 0 if + // unable to grow the stack. + // This function must not trigger a garbage collection. + // + // Called from generated code. + static Address GrowStack(Isolate* isolate); + + // Called from generated code. 
+ static int CheckStackGuardState(Isolate* isolate, int start_index, + RegExp::CallOrigin call_origin, + Address* return_address, + InstructionStream re_code, Address* subject, + const byte** input_start, + const byte** input_end); + + static Address word_character_map_address() { + return reinterpret_cast<Address>(&word_character_map[0]); + } + + protected: + // Byte map of one byte characters with a 0xff if the character is a word + // character (digit, letter or underscore) and 0x00 otherwise. + // Used by generated RegExp code. + static const byte word_character_map[256]; + + Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges); + + private: + // Returns a {Result} sentinel, or the number of successful matches. + static int Execute(String input, int start_offset, const byte* input_start, + const byte* input_end, int* output, int output_size, + Isolate* isolate, JSRegExp regexp); + + ZoneUnorderedMap<uint32_t, Handle<FixedUInt16Array>> range_array_cache_; +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ diff --git a/js/src/irregexp/imported/regexp-nodes.h b/js/src/irregexp/imported/regexp-nodes.h new file mode 100644 index 0000000000..9407f1c5ec --- /dev/null +++ b/js/src/irregexp/imported/regexp-nodes.h @@ -0,0 +1,775 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef V8_REGEXP_REGEXP_NODES_H_ +#define V8_REGEXP_REGEXP_NODES_H_ + +#include "irregexp/imported/regexp-macro-assembler.h" + +namespace v8 { +namespace internal { + +class AlternativeGenerationList; +class BoyerMooreLookahead; +class GreedyLoopState; +class NodeVisitor; +class QuickCheckDetails; +class RegExpCompiler; +class Trace; +struct PreloadState; +class ChoiceNode; + +#define FOR_EACH_NODE_TYPE(VISIT) \ + VISIT(End) \ + VISIT(Action) \ + VISIT(Choice) \ + VISIT(LoopChoice) \ + VISIT(NegativeLookaroundChoice) \ + VISIT(BackReference) \ + VISIT(Assertion) \ + VISIT(Text) + +struct NodeInfo final { + NodeInfo() + : being_analyzed(false), + been_analyzed(false), + follows_word_interest(false), + follows_newline_interest(false), + follows_start_interest(false), + at_end(false), + visited(false), + replacement_calculated(false) {} + + // Returns true if the interests and assumptions of this node + // match those of the given one. + bool Matches(NodeInfo* that) { + return (at_end == that->at_end) && + (follows_word_interest == that->follows_word_interest) && + (follows_newline_interest == that->follows_newline_interest) && + (follows_start_interest == that->follows_start_interest); + } + + // Updates the interests of this node given the interests of the + // node preceding it. + void AddFromPreceding(NodeInfo* that) { + at_end |= that->at_end; + follows_word_interest |= that->follows_word_interest; + follows_newline_interest |= that->follows_newline_interest; + follows_start_interest |= that->follows_start_interest; + } + + bool HasLookbehind() { + return follows_word_interest || follows_newline_interest || + follows_start_interest; + } + + // Sets the interests of this node to include the interests of the + // following node. 
+ void AddFromFollowing(NodeInfo* that) { + follows_word_interest |= that->follows_word_interest; + follows_newline_interest |= that->follows_newline_interest; + follows_start_interest |= that->follows_start_interest; + } + + void ResetCompilationState() { + being_analyzed = false; + been_analyzed = false; + } + + bool being_analyzed : 1; + bool been_analyzed : 1; + + // These bits are set if this node has to know what the preceding + // character was. + bool follows_word_interest : 1; + bool follows_newline_interest : 1; + bool follows_start_interest : 1; + + bool at_end : 1; + bool visited : 1; + bool replacement_calculated : 1; +}; + +struct EatsAtLeastInfo final { + EatsAtLeastInfo() : EatsAtLeastInfo(0) {} + explicit EatsAtLeastInfo(uint8_t eats) + : eats_at_least_from_possibly_start(eats), + eats_at_least_from_not_start(eats) {} + void SetMin(const EatsAtLeastInfo& other) { + if (other.eats_at_least_from_possibly_start < + eats_at_least_from_possibly_start) { + eats_at_least_from_possibly_start = + other.eats_at_least_from_possibly_start; + } + if (other.eats_at_least_from_not_start < eats_at_least_from_not_start) { + eats_at_least_from_not_start = other.eats_at_least_from_not_start; + } + } + + bool IsZero() const { + return eats_at_least_from_possibly_start == 0 && + eats_at_least_from_not_start == 0; + } + + // Any successful match starting from the current node will consume at least + // this many characters. This does not necessarily mean that there is a + // possible match with exactly this many characters, but we generally try to + // get this number as high as possible to allow for early exit on failure. + uint8_t eats_at_least_from_possibly_start; + + // Like eats_at_least_from_possibly_start, but with the additional assumption + // that start-of-string assertions (^) can't match. This value is greater than + // or equal to eats_at_least_from_possibly_start. 
+ uint8_t eats_at_least_from_not_start; +}; + +class RegExpNode : public ZoneObject { + public: + explicit RegExpNode(Zone* zone) + : replacement_(nullptr), + on_work_list_(false), + trace_count_(0), + zone_(zone) { + bm_info_[0] = bm_info_[1] = nullptr; + } + virtual ~RegExpNode(); + virtual void Accept(NodeVisitor* visitor) = 0; + // Generates a goto to this node or actually generates the code at this point. + virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; + // How many characters must this node consume at a minimum in order to + // succeed. The not_at_start argument is used to indicate that we know we are + // not at the start of the input. In this case anchored branches will always + // fail and can be ignored when determining how many characters are consumed + // on success. If this node has not been analyzed yet, EatsAtLeast returns 0. + int EatsAtLeast(bool not_at_start); + // Returns how many characters this node must consume in order to succeed, + // given that this is a LoopChoiceNode whose counter register is in a + // newly-initialized state at the current position in the generated code. For + // example, consider /a{6,8}/. Absent any extra information, the + // LoopChoiceNode for the repetition must report that it consumes at least + // zero characters, because it may have already looped several times. However, + // with a newly-initialized counter, it can report that it consumes at least + // six characters. + virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry(); + // Emits some quick code that checks whether the preloaded characters match. + // Falls through on certain failure, jumps to the label on possible success. + // If the node cannot make a quick check it does nothing and returns false. 
+ bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace, + Trace* trace, bool preload_has_checked_bounds, + Label* on_possible_success, + QuickCheckDetails* details_return, + bool fall_through_on_failure, ChoiceNode* predecessor); + // For a given number of characters this returns a mask and a value. The + // next n characters are anded with the mask and compared with the value. + // A comparison failure indicates the node cannot match the next n characters. + // A comparison success indicates the node may match. + virtual void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, + int characters_filled_in, + bool not_at_start) = 0; + // Fills in quick check details for this node, given that this is a + // LoopChoiceNode whose counter register is in a newly-initialized state at + // the current position in the generated code. For example, consider /a{6,8}/. + // Absent any extra information, the LoopChoiceNode for the repetition cannot + // generate any useful quick check because a match might be the (empty) + // continuation node. However, with a newly-initialized counter, it can + // generate a quick check for several 'a' characters at once. + virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, + RegExpCompiler* compiler, + int characters_filled_in, + bool not_at_start); + static const int kNodeIsTooComplexForGreedyLoops = kMinInt; + virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } + // Only returns the successor for a text node of length 1 that matches any + // character and that has no guards on it. + virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( + RegExpCompiler* compiler) { + return nullptr; + } + + // Collects information on the possible code units (mod 128) that can match if + // we look forward. This is used for a Boyer-Moore-like string searching + // implementation. 
TODO(erikcorry): This should share more code with + // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit + // the number of nodes we are willing to look at in order to create this data. + static const int kRecursionBudget = 200; + bool KeepRecursing(RegExpCompiler* compiler); + virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) { + UNREACHABLE(); + } + + // If we know that the input is one-byte then there are some nodes that can + // never match. This method returns a node that can be substituted for + // itself, or nullptr if the node can never match. + virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) { + return this; + } + // Helper for FilterOneByte. + RegExpNode* replacement() { + DCHECK(info()->replacement_calculated); + return replacement_; + } + RegExpNode* set_replacement(RegExpNode* replacement) { + info()->replacement_calculated = true; + replacement_ = replacement; + return replacement; // For convenience. + } + + // We want to avoid recalculating the lookahead info, so we store it on the + // node. Only info that is for this node is stored. We can tell that the + // info is for this node when offset == 0, so the information is calculated + // relative to this node. + void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { + if (offset == 0) set_bm_info(not_at_start, bm); + } + + Label* label() { return &label_; } + // If non-generic code is generated for a node (i.e. the node is not at the + // start of the trace) then it cannot be reused. This variable sets a limit + // on how often we allow that to happen before we insist on starting a new + // trace and generating generic code for a node that can be reused by flushing + // the deferred actions in the current trace and generating a goto. 
+ static const int kMaxCopiesCodeGenerated = 10; + + bool on_work_list() { return on_work_list_; } + void set_on_work_list(bool value) { on_work_list_ = value; } + + NodeInfo* info() { return &info_; } + const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; } + void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) { + eats_at_least_ = eats_at_least; + } + + // TODO(v8:10441): This is a hacky way to avoid exponential code size growth + // for very large choice nodes that can be generated by unicode property + // escapes. In order to avoid inlining (i.e. trace recursion), we pretend to + // have generated the maximum count of code copies already. + // We should instead fix this properly, e.g. by using the code size budget + // (flush_budget) or by generating property escape matches as calls to a C + // function. + void SetDoNotInline() { trace_count_ = kMaxCopiesCodeGenerated; } + + BoyerMooreLookahead* bm_info(bool not_at_start) { + return bm_info_[not_at_start ? 1 : 0]; + } + + Zone* zone() const { return zone_; } + + protected: + enum LimitResult { DONE, CONTINUE }; + RegExpNode* replacement_; + + LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); + + void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { + bm_info_[not_at_start ? 1 : 0] = bm; + } + + private: + static const int kFirstCharBudget = 10; + Label label_; + bool on_work_list_; + NodeInfo info_; + + // Saved values for EatsAtLeast results, to avoid recomputation. Filled in + // during analysis (valid if info_.been_analyzed is true). + EatsAtLeastInfo eats_at_least_; + + // This variable keeps track of how many times code has been generated for + // this node (in different traces). We don't keep track of where the + // generated code is located unless the code is generated at the start of + // a trace, in which case it is generic and can be reused by flushing the + // deferred operations in the current trace and generating a goto. 
+ int trace_count_; + BoyerMooreLookahead* bm_info_[2]; + + Zone* zone_; +}; + +class SeqRegExpNode : public RegExpNode { + public: + explicit SeqRegExpNode(RegExpNode* on_success) + : RegExpNode(on_success->zone()), on_success_(on_success) {} + RegExpNode* on_success() { return on_success_; } + void set_on_success(RegExpNode* node) { on_success_ = node; } + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override { + on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); + if (offset == 0) set_bm_info(not_at_start, bm); + } + + protected: + RegExpNode* FilterSuccessor(int depth, RegExpFlags flags); + + private: + RegExpNode* on_success_; +}; + +class ActionNode : public SeqRegExpNode { + public: + enum ActionType { + SET_REGISTER_FOR_LOOP, + INCREMENT_REGISTER, + STORE_POSITION, + BEGIN_POSITIVE_SUBMATCH, + BEGIN_NEGATIVE_SUBMATCH, + POSITIVE_SUBMATCH_SUCCESS, + EMPTY_MATCH_CHECK, + CLEAR_CAPTURES + }; + static ActionNode* SetRegisterForLoop(int reg, int val, + RegExpNode* on_success); + static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); + static ActionNode* StorePosition(int reg, bool is_capture, + RegExpNode* on_success); + static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); + static ActionNode* BeginPositiveSubmatch(int stack_pointer_reg, + int position_reg, + RegExpNode* on_success); + static ActionNode* BeginNegativeSubmatch(int stack_pointer_reg, + int position_reg, + RegExpNode* on_success); + static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, + int restore_reg, + int clear_capture_count, + int clear_capture_from, + RegExpNode* on_success); + static ActionNode* EmptyMatchCheck(int start_register, + int repetition_register, + int repetition_limit, + RegExpNode* on_success); + void Accept(NodeVisitor* visitor) override; + void Emit(RegExpCompiler* compiler, 
Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int filled_in, + bool not_at_start) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override; + ActionType action_type() { return action_type_; } + // TODO(erikcorry): We should allow some action nodes in greedy loops. + int GreedyLoopTextLength() override { + return kNodeIsTooComplexForGreedyLoops; + } + + private: + union { + struct { + int reg; + int value; + } u_store_register; + struct { + int reg; + } u_increment_register; + struct { + int reg; + bool is_capture; + } u_position_register; + struct { + int stack_pointer_register; + int current_position_register; + int clear_register_count; + int clear_register_from; + } u_submatch; + struct { + int start_register; + int repetition_register; + int repetition_limit; + } u_empty_match_check; + struct { + int range_from; + int range_to; + } u_clear_captures; + } data_; + ActionNode(ActionType action_type, RegExpNode* on_success) + : SeqRegExpNode(on_success), action_type_(action_type) {} + ActionType action_type_; + friend class DotPrinterImpl; + friend Zone; +}; + +class TextNode : public SeqRegExpNode { + public: + TextNode(ZoneList<TextElement>* elms, bool read_backward, + RegExpNode* on_success) + : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {} + TextNode(RegExpClassRanges* that, bool read_backward, RegExpNode* on_success) + : SeqRegExpNode(on_success), + elms_(zone()->New<ZoneList<TextElement>>(1, zone())), + read_backward_(read_backward) { + elms_->Add(TextElement::ClassRanges(that), zone()); + } + // Create TextNode for a single character class for the given ranges. + static TextNode* CreateForCharacterRanges(Zone* zone, + ZoneList<CharacterRange>* ranges, + bool read_backward, + RegExpNode* on_success); + // Create TextNode for a surrogate pair (i.e. match a sequence of two uc16 + // code unit ranges). 
+ static TextNode* CreateForSurrogatePair( + Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges, + bool read_backward, RegExpNode* on_success); + static TextNode* CreateForSurrogatePair(Zone* zone, + ZoneList<CharacterRange>* lead_ranges, + CharacterRange trail, + bool read_backward, + RegExpNode* on_success); + void Accept(NodeVisitor* visitor) override; + void Emit(RegExpCompiler* compiler, Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int characters_filled_in, + bool not_at_start) override; + ZoneList<TextElement>* elements() { return elms_; } + bool read_backward() { return read_backward_; } + void MakeCaseIndependent(Isolate* isolate, bool is_one_byte, + RegExpFlags flags); + int GreedyLoopTextLength() override; + RegExpNode* GetSuccessorOfOmnivorousTextNode( + RegExpCompiler* compiler) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override; + void CalculateOffsets(); + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; + int Length(); + + private: + enum TextEmitPassType { + NON_LATIN1_MATCH, // Check for characters that can't match. + SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. + NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. + CASE_CHARACTER_MATCH, // Case-independent single character check. + CHARACTER_CLASS_MATCH // Character class. 
+ }; + static bool SkipPass(TextEmitPassType pass, bool ignore_case); + static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; + static const int kLastPass = CHARACTER_CLASS_MATCH; + void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, + bool preloaded, Trace* trace, bool first_element_checked, + int* checked_up_to); + ZoneList<TextElement>* elms_; + bool read_backward_; +}; + +class AssertionNode : public SeqRegExpNode { + public: + enum AssertionType { + AT_END, + AT_START, + AT_BOUNDARY, + AT_NON_BOUNDARY, + AFTER_NEWLINE + }; + static AssertionNode* AtEnd(RegExpNode* on_success) { + return on_success->zone()->New<AssertionNode>(AT_END, on_success); + } + static AssertionNode* AtStart(RegExpNode* on_success) { + return on_success->zone()->New<AssertionNode>(AT_START, on_success); + } + static AssertionNode* AtBoundary(RegExpNode* on_success) { + return on_success->zone()->New<AssertionNode>(AT_BOUNDARY, on_success); + } + static AssertionNode* AtNonBoundary(RegExpNode* on_success) { + return on_success->zone()->New<AssertionNode>(AT_NON_BOUNDARY, on_success); + } + static AssertionNode* AfterNewline(RegExpNode* on_success) { + return on_success->zone()->New<AssertionNode>(AFTER_NEWLINE, on_success); + } + void Accept(NodeVisitor* visitor) override; + void Emit(RegExpCompiler* compiler, Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int filled_in, + bool not_at_start) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override; + AssertionType assertion_type() { return assertion_type_; } + + private: + friend Zone; + + void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); + enum IfPrevious { kIsNonWord, kIsWord }; + void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace, + IfPrevious backtrack_if_previous); + AssertionNode(AssertionType t, RegExpNode* on_success) + : SeqRegExpNode(on_success), 
assertion_type_(t) {} + AssertionType assertion_type_; +}; + +class BackReferenceNode : public SeqRegExpNode { + public: + BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags, + bool read_backward, RegExpNode* on_success) + : SeqRegExpNode(on_success), + start_reg_(start_reg), + end_reg_(end_reg), + flags_(flags), + read_backward_(read_backward) {} + void Accept(NodeVisitor* visitor) override; + int start_register() { return start_reg_; } + int end_register() { return end_reg_; } + bool read_backward() { return read_backward_; } + void Emit(RegExpCompiler* compiler, Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int characters_filled_in, + bool not_at_start) override { + return; + } + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override; + + private: + int start_reg_; + int end_reg_; + RegExpFlags flags_; + bool read_backward_; +}; + +class EndNode : public RegExpNode { + public: + enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; + EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {} + void Accept(NodeVisitor* visitor) override; + void Emit(RegExpCompiler* compiler, Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int characters_filled_in, + bool not_at_start) override { + // Returning 0 from EatsAtLeast should ensure we never get here. + UNREACHABLE(); + } + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override { + // Returning 0 from EatsAtLeast should ensure we never get here. 
+ UNREACHABLE(); + } + + private: + Action action_; +}; + +class NegativeSubmatchSuccess : public EndNode { + public: + NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, + int clear_capture_count, int clear_capture_start, + Zone* zone) + : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), + stack_pointer_register_(stack_pointer_reg), + current_position_register_(position_reg), + clear_capture_count_(clear_capture_count), + clear_capture_start_(clear_capture_start) {} + void Emit(RegExpCompiler* compiler, Trace* trace) override; + + private: + int stack_pointer_register_; + int current_position_register_; + int clear_capture_count_; + int clear_capture_start_; +}; + +class Guard : public ZoneObject { + public: + enum Relation { LT, GEQ }; + Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {} + int reg() { return reg_; } + Relation op() { return op_; } + int value() { return value_; } + + private: + int reg_; + Relation op_; + int value_; +}; + +class GuardedAlternative { + public: + explicit GuardedAlternative(RegExpNode* node) + : node_(node), guards_(nullptr) {} + void AddGuard(Guard* guard, Zone* zone); + RegExpNode* node() { return node_; } + void set_node(RegExpNode* node) { node_ = node; } + ZoneList<Guard*>* guards() { return guards_; } + + private: + RegExpNode* node_; + ZoneList<Guard*>* guards_; +}; + +class AlternativeGeneration; + +class ChoiceNode : public RegExpNode { + public: + explicit ChoiceNode(int expected_size, Zone* zone) + : RegExpNode(zone), + alternatives_( + zone->New<ZoneList<GuardedAlternative>>(expected_size, zone)), + not_at_start_(false), + being_calculated_(false) {} + void Accept(NodeVisitor* visitor) override; + void AddAlternative(GuardedAlternative node) { + alternatives()->Add(node, zone()); + } + ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } + void Emit(RegExpCompiler* compiler, Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + 
RegExpCompiler* compiler, int characters_filled_in, + bool not_at_start) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override; + + bool being_calculated() { return being_calculated_; } + bool not_at_start() { return not_at_start_; } + void set_not_at_start() { not_at_start_ = true; } + void set_being_calculated(bool b) { being_calculated_ = b; } + virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { + return true; + } + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; + virtual bool read_backward() { return false; } + + protected: + int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); + ZoneList<GuardedAlternative>* alternatives_; + + private: + template <typename...> + friend class Analysis; + + void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, + Trace* trace); + int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); + void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, + GuardedAlternative alternative, + AlternativeGeneration* alt_gen, + int preload_characters, + bool next_expects_preload); + void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, + PreloadState* preloads); + void AssertGuardsMentionRegisters(Trace* trace); + int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace); + Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, + AlternativeGenerationList* alt_gens, + PreloadState* preloads, + GreedyLoopState* greedy_loop_state, int text_length); + void EmitChoices(RegExpCompiler* compiler, + AlternativeGenerationList* alt_gens, int first_choice, + Trace* trace, PreloadState* preloads); + + // If true, this node is never checked at the start of the input. + // Allows a new trace to start with at_start() set to false. 
+ bool not_at_start_; + bool being_calculated_; +}; + +class NegativeLookaroundChoiceNode : public ChoiceNode { + public: + explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail, + GuardedAlternative then_do_this, + Zone* zone) + : ChoiceNode(2, zone) { + AddAlternative(this_must_fail); + AddAlternative(then_do_this); + } + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int characters_filled_in, + bool not_at_start) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override { + continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm, + not_at_start); + if (offset == 0) set_bm_info(not_at_start, bm); + } + static constexpr int kLookaroundIndex = 0; + static constexpr int kContinueIndex = 1; + RegExpNode* lookaround_node() { + return alternatives()->at(kLookaroundIndex).node(); + } + RegExpNode* continue_node() { + return alternatives()->at(kContinueIndex).node(); + } + // For a negative lookahead we don't emit the quick check for the + // alternative that is expected to fail. This is because quick check code + // starts by loading enough characters for the alternative that takes fewest + // characters, but on a negative lookahead the negative branch did not take + // part in that calculation (EatsAtLeast) so the assumptions don't hold. 
+ bool try_to_emit_quick_check_for_alternative(bool is_first) override { + return !is_first; + } + void Accept(NodeVisitor* visitor) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; +}; + +class LoopChoiceNode : public ChoiceNode { + public: + LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, + int min_loop_iterations, Zone* zone) + : ChoiceNode(2, zone), + loop_node_(nullptr), + continue_node_(nullptr), + body_can_be_zero_length_(body_can_be_zero_length), + read_backward_(read_backward), + traversed_loop_initialization_node_(false), + min_loop_iterations_(min_loop_iterations) {} + void AddLoopAlternative(GuardedAlternative alt); + void AddContinueAlternative(GuardedAlternative alt); + void Emit(RegExpCompiler* compiler, Trace* trace) override; + void GetQuickCheckDetails(QuickCheckDetails* details, + RegExpCompiler* compiler, int characters_filled_in, + bool not_at_start) override; + void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, + RegExpCompiler* compiler, + int characters_filled_in, + bool not_at_start) override; + void FillInBMInfo(Isolate* isolate, int offset, int budget, + BoyerMooreLookahead* bm, bool not_at_start) override; + EatsAtLeastInfo EatsAtLeastFromLoopEntry() override; + RegExpNode* loop_node() { return loop_node_; } + RegExpNode* continue_node() { return continue_node_; } + bool body_can_be_zero_length() { return body_can_be_zero_length_; } + int min_loop_iterations() const { return min_loop_iterations_; } + bool read_backward() override { return read_backward_; } + void Accept(NodeVisitor* visitor) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; + + private: + // AddAlternative is made private for loop nodes because alternatives + // should not be added freely, we need to keep track of which node + // goes back to the node itself. 
+ void AddAlternative(GuardedAlternative node) { + ChoiceNode::AddAlternative(node); + } + + RegExpNode* loop_node_; + RegExpNode* continue_node_; + bool body_can_be_zero_length_; + bool read_backward_; + + // Temporary marker set only while generating quick check details. Represents + // whether GetQuickCheckDetails traversed the initialization node for this + // loop's counter. If so, we may be able to generate stricter quick checks + // because we know the loop node must match at least min_loop_iterations_ + // times before the continuation node can match. + bool traversed_loop_initialization_node_; + + // The minimum number of times the loop_node_ must match before the + // continue_node_ might be considered. This value can be temporarily decreased + // while generating quick check details, to represent the remaining iterations + // after the completed portion of the quick check details. + int min_loop_iterations_; + + friend class IterationDecrementer; + friend class LoopInitializationMarker; +}; + +class NodeVisitor { + public: + virtual ~NodeVisitor() = default; +#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0; + FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_NODES_H_ diff --git a/js/src/irregexp/imported/regexp-parser.cc b/js/src/irregexp/imported/regexp-parser.cc new file mode 100644 index 0000000000..57f4c12fc5 --- /dev/null +++ b/js/src/irregexp/imported/regexp-parser.cc @@ -0,0 +1,3131 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "irregexp/imported/regexp-parser.h" + +#include "irregexp/imported/regexp-ast.h" +#include "irregexp/imported/regexp-macro-assembler.h" +#include "irregexp/imported/regexp.h" + +#ifdef V8_INTL_SUPPORT +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/usetiter.h" +#include "unicode/utf16.h" // For U16_NEXT +#endif // V8_INTL_SUPPORT + +namespace v8 { +namespace internal { + +namespace { + +// Whether we're currently inside the ClassEscape production +// (tc39.es/ecma262/#prod-annexB-CharacterEscape). +enum class InClassEscapeState { + kInClass, + kNotInClass, +}; + +// The production used to derive ClassSetOperand. +enum class ClassSetOperandType { + kClassSetCharacter, + kClassStringDisjunction, + kNestedClass, + kCharacterClassEscape, // \ CharacterClassEscape is a special nested class, + // as we can fold it directly into another range. + kClassSetRange +}; + +class RegExpTextBuilder { + public: + using SmallRegExpTreeVector = + base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>; + + RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage, + RegExpFlags flags) + : zone_(zone), + flags_(flags), + terms_(terms_storage), + text_(ZoneAllocator<RegExpTree*>{zone}) {} + void AddCharacter(base::uc16 character); + void AddUnicodeCharacter(base::uc32 character); + void AddEscapedUnicodeCharacter(base::uc32 character); + void AddAtom(RegExpTree* atom); + void AddTerm(RegExpTree* term); + void AddClassRanges(RegExpClassRanges* cc); + void FlushPendingSurrogate(); + void FlushText(); + RegExpTree* PopLastAtom(); + RegExpTree* ToRegExp(); + + private: + static const base::uc16 kNoPendingSurrogate = 0; + + void AddLeadSurrogate(base::uc16 lead_surrogate); + void AddTrailSurrogate(base::uc16 trail_surrogate); + void FlushCharacters(); + bool NeedsDesugaringForUnicode(RegExpClassRanges* cc); + bool NeedsDesugaringForIgnoreCase(base::uc32 c); + void AddClassRangesForDesugaring(base::uc32 c); + bool ignore_case() const { 
return IsIgnoreCase(flags_); } + bool IsUnicodeMode() const { + // Either /v or /u enable UnicodeMode + // TODO(v8:11935): Change permalink once proposal is in stage 4. + // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern + return IsUnicode(flags_) || IsUnicodeSets(flags_); + } + Zone* zone() const { return zone_; } + + Zone* const zone_; + const RegExpFlags flags_; + ZoneList<base::uc16>* characters_ = nullptr; + base::uc16 pending_surrogate_ = kNoPendingSurrogate; + SmallRegExpTreeVector* terms_; + SmallRegExpTreeVector text_; +}; + +void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) { + DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); + FlushPendingSurrogate(); + // Hold onto the lead surrogate, waiting for a trail surrogate to follow. + pending_surrogate_ = lead_surrogate; +} + +void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) { + DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); + if (pending_surrogate_ != kNoPendingSurrogate) { + base::uc16 lead_surrogate = pending_surrogate_; + pending_surrogate_ = kNoPendingSurrogate; + DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); + base::uc32 combined = + unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); + if (NeedsDesugaringForIgnoreCase(combined)) { + AddClassRangesForDesugaring(combined); + } else { + ZoneList<base::uc16> surrogate_pair(2, zone()); + surrogate_pair.Add(lead_surrogate, zone()); + surrogate_pair.Add(trail_surrogate, zone()); + RegExpAtom* atom = + zone()->New<RegExpAtom>(surrogate_pair.ToConstVector()); + AddAtom(atom); + } + } else { + pending_surrogate_ = trail_surrogate; + FlushPendingSurrogate(); + } +} + +void RegExpTextBuilder::FlushPendingSurrogate() { + if (pending_surrogate_ != kNoPendingSurrogate) { + DCHECK(IsUnicodeMode()); + base::uc32 c = pending_surrogate_; + pending_surrogate_ = kNoPendingSurrogate; + AddClassRangesForDesugaring(c); + } +} + +void 
RegExpTextBuilder::FlushCharacters() { + FlushPendingSurrogate(); + if (characters_ != nullptr) { + RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector()); + characters_ = nullptr; + text_.emplace_back(atom); + } +} + +void RegExpTextBuilder::FlushText() { + FlushCharacters(); + size_t num_text = text_.size(); + if (num_text == 0) { + return; + } else if (num_text == 1) { + terms_->emplace_back(text_.back()); + } else { + RegExpText* text = zone()->New<RegExpText>(zone()); + for (size_t i = 0; i < num_text; i++) { + text_[i]->AppendToText(text, zone()); + } + terms_->emplace_back(text); + } + text_.clear(); +} + +void RegExpTextBuilder::AddCharacter(base::uc16 c) { + FlushPendingSurrogate(); + if (NeedsDesugaringForIgnoreCase(c)) { + AddClassRangesForDesugaring(c); + } else { + if (characters_ == nullptr) { + characters_ = zone()->New<ZoneList<base::uc16>>(4, zone()); + } + characters_->Add(c, zone()); + } +} + +void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) { + if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { + DCHECK(IsUnicodeMode()); + AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); + AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); + } else if (IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(c)) { + AddLeadSurrogate(c); + } else if (IsUnicodeMode() && unibrow::Utf16::IsTrailSurrogate(c)) { + AddTrailSurrogate(c); + } else { + AddCharacter(static_cast<base::uc16>(c)); + } +} + +void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { + // A lead or trail surrogate parsed via escape sequence will not + // pair up with any preceding lead or following trail surrogate. 
+ FlushPendingSurrogate(); + AddUnicodeCharacter(character); + FlushPendingSurrogate(); +} + +void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) { + if (NeedsDesugaringForUnicode(cr)) { + // With /u or /v, character class needs to be desugared, so it + // must be a standalone term instead of being part of a RegExpText. + AddTerm(cr); + } else { + AddAtom(cr); + } +} + +void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) { + AddTerm(zone()->New<RegExpClassRanges>( + zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)))); +} + +void RegExpTextBuilder::AddAtom(RegExpTree* atom) { + DCHECK(atom->IsTextElement()); + FlushCharacters(); + text_.emplace_back(atom); +} + +void RegExpTextBuilder::AddTerm(RegExpTree* term) { + DCHECK(term->IsTextElement()); + FlushText(); + terms_->emplace_back(term); +} + +bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) { + if (!IsUnicodeMode()) return false; + // TODO(yangguo): we could be smarter than this. Case-insensitivity does not + // necessarily mean that we need to desugar. It's probably nicer to have a + // separate pass to figure out unicode desugarings. + if (ignore_case()) return true; + ZoneList<CharacterRange>* ranges = cc->ranges(zone()); + CharacterRange::Canonicalize(ranges); + + if (cc->is_negated()) { + ZoneList<CharacterRange>* negated_ranges = + zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone()); + CharacterRange::Negate(ranges, negated_ranges, zone()); + ranges = negated_ranges; + } + + for (int i = ranges->length() - 1; i >= 0; i--) { + base::uc32 from = ranges->at(i).from(); + base::uc32 to = ranges->at(i).to(); + // Check for non-BMP characters. + if (to >= kNonBmpStart) return true; + // Check for lone surrogates. 
+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; + } + return false; +} + +bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) { +#ifdef V8_INTL_SUPPORT + if (IsUnicodeMode() && ignore_case()) { + icu::UnicodeSet set(c, c); + set.closeOver(USET_CASE_INSENSITIVE); + set.removeAllStrings(); + return set.size() > 1; + } + // In the case where ICU is not included, we act as if the unicode flag is + // not set, and do not desugar. +#endif // V8_INTL_SUPPORT + return false; +} + +RegExpTree* RegExpTextBuilder::PopLastAtom() { + FlushPendingSurrogate(); + RegExpTree* atom; + if (characters_ != nullptr) { + base::Vector<const base::uc16> char_vector = characters_->ToConstVector(); + int num_chars = char_vector.length(); + if (num_chars > 1) { + base::Vector<const base::uc16> prefix = + char_vector.SubVector(0, num_chars - 1); + text_.emplace_back(zone()->New<RegExpAtom>(prefix)); + char_vector = char_vector.SubVector(num_chars - 1, num_chars); + } + characters_ = nullptr; + atom = zone()->New<RegExpAtom>(char_vector); + return atom; + } else if (text_.size() > 0) { + atom = text_.back(); + text_.pop_back(); + return atom; + } + return nullptr; +} + +RegExpTree* RegExpTextBuilder::ToRegExp() { + FlushText(); + size_t num_alternatives = terms_->size(); + if (num_alternatives == 0) return zone()->New<RegExpEmpty>(); + if (num_alternatives == 1) return terms_->back(); + return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( + base::VectorOf(terms_->begin(), terms_->size()), zone())); +} + +// Accumulates RegExp atoms and assertions into lists of terms and alternatives. 
+class RegExpBuilder { + public: + RegExpBuilder(Zone* zone, RegExpFlags flags) + : zone_(zone), + flags_(flags), + terms_(ZoneAllocator<RegExpTree*>{zone}), + alternatives_(ZoneAllocator<RegExpTree*>{zone}), + text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {} + void AddCharacter(base::uc16 character); + void AddUnicodeCharacter(base::uc32 character); + void AddEscapedUnicodeCharacter(base::uc32 character); + // "Adds" an empty expression. Does nothing except consume a + // following quantifier + void AddEmpty(); + void AddClassRanges(RegExpClassRanges* cc); + void AddAtom(RegExpTree* tree); + void AddTerm(RegExpTree* tree); + void AddAssertion(RegExpTree* tree); + void NewAlternative(); // '|' + bool AddQuantifierToAtom(int min, int max, + RegExpQuantifier::QuantifierType type); + void FlushText(); + RegExpTree* ToRegExp(); + RegExpFlags flags() const { return flags_; } + + bool ignore_case() const { return IsIgnoreCase(flags_); } + bool multiline() const { return IsMultiline(flags_); } + bool dotall() const { return IsDotAll(flags_); } + + private: + void FlushTerms(); + bool IsUnicodeMode() const { + // Either /v or /u enable UnicodeMode + // TODO(v8:11935): Change permalink once proposal is in stage 4. + // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern + return IsUnicode(flags_) || IsUnicodeSets(flags_); + } + Zone* zone() const { return zone_; } + RegExpTextBuilder& text_builder() { return text_builder_; } + + Zone* const zone_; + bool pending_empty_ = false; + const RegExpFlags flags_; + + using SmallRegExpTreeVector = + base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>; + SmallRegExpTreeVector terms_; + SmallRegExpTreeVector alternatives_; + RegExpTextBuilder text_builder_; +}; + +enum SubexpressionType { + INITIAL, + CAPTURE, // All positive values represent captures. 
+ POSITIVE_LOOKAROUND, + NEGATIVE_LOOKAROUND, + GROUPING +}; + +class RegExpParserState : public ZoneObject { + public: + // Push a state on the stack. + RegExpParserState(RegExpParserState* previous_state, + SubexpressionType group_type, + RegExpLookaround::Type lookaround_type, + int disjunction_capture_index, + const ZoneVector<base::uc16>* capture_name, + RegExpFlags flags, Zone* zone) + : previous_state_(previous_state), + builder_(zone, flags), + group_type_(group_type), + lookaround_type_(lookaround_type), + disjunction_capture_index_(disjunction_capture_index), + capture_name_(capture_name) {} + // Parser state of containing expression, if any. + RegExpParserState* previous_state() const { return previous_state_; } + bool IsSubexpression() { return previous_state_ != nullptr; } + // RegExpBuilder building this regexp's AST. + RegExpBuilder* builder() { return &builder_; } + // Type of regexp being parsed (parenthesized group or entire regexp). + SubexpressionType group_type() const { return group_type_; } + // Lookahead or Lookbehind. + RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } + // Index in captures array of first capture in this sub-expression, if any. + // Also the capture index of this sub-expression itself, if group_type + // is CAPTURE. + int capture_index() const { return disjunction_capture_index_; } + // The name of the current sub-expression, if group_type is CAPTURE. Only + // used for named captures. + const ZoneVector<base::uc16>* capture_name() const { return capture_name_; } + + bool IsNamedCapture() const { return capture_name_ != nullptr; } + + // Check whether the parser is inside a capture group with the given index. + bool IsInsideCaptureGroup(int index) const { + for (const RegExpParserState* s = this; s != nullptr; + s = s->previous_state()) { + if (s->group_type() != CAPTURE) continue; + // Return true if we found the matching capture index. 
+ if (index == s->capture_index()) return true; + // Abort if index is larger than what has been parsed up till this state. + if (index > s->capture_index()) return false; + } + return false; + } + + // Check whether the parser is inside a capture group with the given name. + bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const { + DCHECK_NOT_NULL(name); + for (const RegExpParserState* s = this; s != nullptr; + s = s->previous_state()) { + if (s->capture_name() == nullptr) continue; + if (*s->capture_name() == *name) return true; + } + return false; + } + + private: + // Linked list implementation of stack of states. + RegExpParserState* const previous_state_; + // Builder for the stored disjunction. + RegExpBuilder builder_; + // Stored disjunction type (capture, look-ahead or grouping), if any. + const SubexpressionType group_type_; + // Stored read direction. + const RegExpLookaround::Type lookaround_type_; + // Stored disjunction's capture index (if any). + const int disjunction_capture_index_; + // Stored capture name (if any). + const ZoneVector<base::uc16>* const capture_name_; +}; + +template <class CharT> +class RegExpParserImpl final { + private: + RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags, + uintptr_t stack_limit, Zone* zone, + const DisallowGarbageCollection& no_gc); + + bool Parse(RegExpCompileData* result); + + RegExpTree* ParsePattern(); + RegExpTree* ParseDisjunction(); + RegExpTree* ParseGroup(); + + // Parses a {...,...} quantifier and stores the range in the given + // out parameters. + bool ParseIntervalQuantifier(int* min_out, int* max_out); + + // Checks whether the following is a length-digit hexadecimal number, + // and sets the value if it is. 
+ bool ParseHexEscape(int length, base::uc32* value); + bool ParseUnicodeEscape(base::uc32* value); + bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value); + + bool ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2); + bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range, + CharacterClassStrings* add_to_strings, bool negate, + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2); + + RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges, + bool add_unicode_case_equivalents); + // Parse inside a class. Either add escaped class to the range, or return + // false and pass parsed single character through |char_out|. + void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, + bool add_unicode_case_equivalents, base::uc32* char_out, + bool* is_class_escape); + // Returns true iff parsing was successful. + bool TryParseCharacterClassEscape(base::uc32 next, + InClassEscapeState in_class_escape_state, + ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings, Zone* zone, + bool add_unicode_case_equivalents); + RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings); + RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, + ClassSetOperandType* type_out); + RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder, + ClassSetOperandType* type_out, + ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings); + base::uc32 ParseClassSetCharacter(); + // Parses and returns a single escaped character. 
+ base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state, + bool* is_escaped_unicode_character); + + RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated, + RegExpTree* first_operand, + ClassSetOperandType first_operand_type, + ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings); + RegExpTree* ParseClassIntersection(const RegExpBuilder* builder, + bool is_negated, RegExpTree* first_operand, + ClassSetOperandType first_operand_type); + RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder, + bool is_negated, RegExpTree* first_operand, + ClassSetOperandType first_operand_type); + RegExpTree* ParseCharacterClass(const RegExpBuilder* state); + + base::uc32 ParseOctalLiteral(); + + // Tries to parse the input as a back reference. If successful it + // stores the result in the output parameter and returns true. If + // it fails it will push back the characters read so the same characters + // can be reparsed. + bool ParseBackReferenceIndex(int* index_out); + + RegExpTree* ReportError(RegExpError error); + void Advance(); + void Advance(int dist); + void RewindByOneCodepoint(); // Rewinds to before the previous Advance(). + void Reset(int pos); + + // Reports whether the pattern might be used as a literal search string. + // Only use if the result of the parse is a single atom node. + bool simple() const { return simple_; } + bool contains_anchor() const { return contains_anchor_; } + void set_contains_anchor() { contains_anchor_ = true; } + int captures_started() const { return captures_started_; } + int position() const { return next_pos_ - 1; } + bool failed() const { return failed_; } + RegExpFlags flags() const { return top_level_flags_; } + bool IsUnicodeMode() const { + // Either /v or /u enable UnicodeMode + // TODO(v8:11935): Change permalink once proposal is in stage 4. 
+ // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-parsepattern + return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_; + } + bool unicode_sets() const { return IsUnicodeSets(flags()); } + bool ignore_case() const { return IsIgnoreCase(flags()); } + + static bool IsSyntaxCharacterOrSlash(base::uc32 c); + static bool IsClassSetSyntaxCharacter(base::uc32 c); + static bool IsClassSetReservedPunctuator(base::uc32 c); + bool IsClassSetReservedDoublePunctuator(base::uc32 c); + + static const base::uc32 kEndMarker = (1 << 21); + + private: + // Return the 1-indexed RegExpCapture object, allocate if necessary. + RegExpCapture* GetCapture(int index); + + // Creates a new named capture at the specified index. Must be called exactly + // once for each named capture. Fails if a capture with the same name is + // encountered. + bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index); + + // Parses the name of a capture group (?<name>pattern). The name must adhere + // to IdentifierName in the ECMAScript standard. + const ZoneVector<base::uc16>* ParseCaptureGroupName(); + + bool ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state); + RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); + + // After the initial parsing pass, patch corresponding RegExpCapture objects + // into all RegExpBackReferences. This is done after initial parsing in order + // to avoid complicating cases in which references comes before the capture. + void PatchNamedBackReferences(); + + ZoneVector<RegExpCapture*>* GetNamedCaptures() const; + + // Returns true iff the pattern contains named captures. May call + // ScanForCaptures to look ahead at the remaining pattern. 
+ bool HasNamedCaptures(InClassEscapeState in_class_escape_state); + + Zone* zone() const { return zone_; } + + base::uc32 current() const { return current_; } + bool has_more() const { return has_more_; } + bool has_next() const { return next_pos_ < input_length(); } + base::uc32 Next(); + template <bool update_position> + base::uc32 ReadNext(); + CharT InputAt(int index) const { + DCHECK(0 <= index && index < input_length()); + return input_[index]; + } + int input_length() const { return input_length_; } + void ScanForCaptures(InClassEscapeState in_class_escape_state); + + struct RegExpCaptureNameLess { + bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { + DCHECK_NOT_NULL(lhs); + DCHECK_NOT_NULL(rhs); + return *lhs->name() < *rhs->name(); + } + }; + + class ForceUnicodeScope final { + public: + explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser) + : parser_(parser) { + DCHECK(!parser_->force_unicode_); + parser_->force_unicode_ = true; + } + ~ForceUnicodeScope() { + DCHECK(parser_->force_unicode_); + parser_->force_unicode_ = false; + } + + private: + RegExpParserImpl<CharT>* const parser_; + }; + + const DisallowGarbageCollection no_gc_; + Zone* const zone_; + RegExpError error_ = RegExpError::kNone; + int error_pos_ = 0; + ZoneList<RegExpCapture*>* captures_; + ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; + ZoneList<RegExpBackReference*>* named_back_references_; + const CharT* const input_; + const int input_length_; + base::uc32 current_; + const RegExpFlags top_level_flags_; + bool force_unicode_ = false; // Force parser to act as if unicode were set. + int next_pos_; + int captures_started_; + int capture_count_; // Only valid after we have scanned for captures. + bool has_more_; + bool simple_; + bool contains_anchor_; + bool is_scanned_for_captures_; + bool has_named_captures_; // Only valid after we have scanned for captures. 
+ bool failed_; + const uintptr_t stack_limit_; + + friend class v8::internal::RegExpParser; +}; + +template <class CharT> +RegExpParserImpl<CharT>::RegExpParserImpl( + const CharT* input, int input_length, RegExpFlags flags, + uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc) + : zone_(zone), + captures_(nullptr), + named_captures_(nullptr), + named_back_references_(nullptr), + input_(input), + input_length_(input_length), + current_(kEndMarker), + top_level_flags_(flags), + next_pos_(0), + captures_started_(0), + capture_count_(0), + has_more_(true), + simple_(false), + contains_anchor_(false), + is_scanned_for_captures_(false), + has_named_captures_(false), + failed_(false), + stack_limit_(stack_limit) { + Advance(); +} + +template <> +template <bool update_position> +inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() { + int position = next_pos_; + base::uc16 c0 = InputAt(position); + position++; + DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0)); + if (update_position) next_pos_ = position; + return c0; +} + +template <> +template <bool update_position> +inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() { + int position = next_pos_; + base::uc16 c0 = InputAt(position); + base::uc32 result = c0; + position++; + // Read the whole surrogate pair in case of unicode mode, if possible. 
+ if (IsUnicodeMode() && position < input_length() && + unibrow::Utf16::IsLeadSurrogate(c0)) { + base::uc16 c1 = InputAt(position); + if (unibrow::Utf16::IsTrailSurrogate(c1)) { + result = unibrow::Utf16::CombineSurrogatePair(c0, c1); + position++; + } + } + if (update_position) next_pos_ = position; + return result; +} + +template <class CharT> +base::uc32 RegExpParserImpl<CharT>::Next() { + if (has_next()) { + return ReadNext<false>(); + } else { + return kEndMarker; + } +} + +template <class CharT> +void RegExpParserImpl<CharT>::Advance() { + if (has_next()) { + if (GetCurrentStackPosition() < stack_limit_) { + if (v8_flags.correctness_fuzzer_suppressions) { + FATAL("Aborting on stack overflow"); + } + ReportError(RegExpError::kStackOverflow); + } else { + current_ = ReadNext<true>(); + } + } else { + current_ = kEndMarker; + // Advance so that position() points to 1-after-the-last-character. This is + // important so that Reset() to this position works correctly. + next_pos_ = input_length() + 1; + has_more_ = false; + } +} + +template <class CharT> +void RegExpParserImpl<CharT>::RewindByOneCodepoint() { + if (!has_more()) return; + // Rewinds by one code point, i.e.: two code units if `current` is outside + // the basic multilingual plane (= composed of a lead and trail surrogate), + // or one code unit otherwise. + const int rewind_by = + current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1; + Advance(rewind_by); // Undo the last Advance. 
+} + +template <class CharT> +void RegExpParserImpl<CharT>::Reset(int pos) { + next_pos_ = pos; + has_more_ = (pos < input_length()); + Advance(); +} + +template <class CharT> +void RegExpParserImpl<CharT>::Advance(int dist) { + next_pos_ += dist - 1; + Advance(); +} + +// static +template <class CharT> +bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) { + switch (c) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '|': + case '/': + return true; + default: + break; + } + return false; +} + +// static +template <class CharT> +bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) { + switch (c) { + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '/': + case '-': + case '\\': + case '|': + return true; + default: + break; + } + return false; +} + +// static +template <class CharT> +bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) { + switch (c) { + case '&': + case '-': + case '!': + case '#': + case '%': + case ',': + case ':': + case ';': + case '<': + case '=': + case '>': + case '@': + case '`': + case '~': + return true; + default: + break; + } + return false; +} + +template <class CharT> +bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) { +#define DOUBLE_PUNCTUATOR_CASE(Char) \ + case Char: \ + return Next() == Char + + switch (c) { + DOUBLE_PUNCTUATOR_CASE('&'); + DOUBLE_PUNCTUATOR_CASE('!'); + DOUBLE_PUNCTUATOR_CASE('#'); + DOUBLE_PUNCTUATOR_CASE('$'); + DOUBLE_PUNCTUATOR_CASE('%'); + DOUBLE_PUNCTUATOR_CASE('*'); + DOUBLE_PUNCTUATOR_CASE('+'); + DOUBLE_PUNCTUATOR_CASE(','); + DOUBLE_PUNCTUATOR_CASE('.'); + DOUBLE_PUNCTUATOR_CASE(':'); + DOUBLE_PUNCTUATOR_CASE(';'); + DOUBLE_PUNCTUATOR_CASE('<'); + DOUBLE_PUNCTUATOR_CASE('='); + DOUBLE_PUNCTUATOR_CASE('>'); + DOUBLE_PUNCTUATOR_CASE('?'); + DOUBLE_PUNCTUATOR_CASE('@'); + 
DOUBLE_PUNCTUATOR_CASE('^'); + DOUBLE_PUNCTUATOR_CASE('`'); + DOUBLE_PUNCTUATOR_CASE('~'); + default: + break; + } +#undef DOUBLE_PUNCTUATOR_CASE + + return false; +} + +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) { + if (failed_) return nullptr; // Do not overwrite any existing error. + failed_ = true; + error_ = error; + error_pos_ = position(); + // Zip to the end to make sure no more input is read. + current_ = kEndMarker; + next_pos_ = input_length(); + has_more_ = false; + return nullptr; +} + +#define CHECK_FAILED /**/); \ + if (failed_) return nullptr; \ + ((void)0 + +// Pattern :: +// Disjunction +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ParsePattern() { + RegExpTree* result = ParseDisjunction(CHECK_FAILED); + PatchNamedBackReferences(CHECK_FAILED); + DCHECK(!has_more()); + // If the result of parsing is a literal string atom, and it has the + // same length as the input, then the atom is identical to the input. + if (result->IsAtom() && result->AsAtom()->length() == input_length()) { + simple_ = true; + } + return result; +} + +// Disjunction :: +// Alternative +// Alternative | Disjunction +// Alternative :: +// [empty] +// Term Alternative +// Term :: +// Assertion +// Atom +// Atom Quantifier +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { + // Used to store current state while parsing subexpressions. + RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, + 0, nullptr, flags(), zone()); + RegExpParserState* state = &initial_state; + // Cache the builder in a local variable for quick access. + RegExpBuilder* builder = initial_state.builder(); + while (true) { + switch (current()) { + case kEndMarker: + if (failed()) return nullptr; // E.g. the initial Advance failed. + if (state->IsSubexpression()) { + // Inside a parenthesized group when hitting end of input. 
+ return ReportError(RegExpError::kUnterminatedGroup); + } + DCHECK_EQ(INITIAL, state->group_type()); + // Parsing completed successfully. + return builder->ToRegExp(); + case ')': { + if (!state->IsSubexpression()) { + return ReportError(RegExpError::kUnmatchedParen); + } + DCHECK_NE(INITIAL, state->group_type()); + + Advance(); + // End disjunction parsing and convert builder content to new single + // regexp atom. + RegExpTree* body = builder->ToRegExp(); + + int end_capture_index = captures_started(); + + int capture_index = state->capture_index(); + SubexpressionType group_type = state->group_type(); + + // Build result of subexpression. + if (group_type == CAPTURE) { + if (state->IsNamedCapture()) { + CreateNamedCaptureAtIndex(state->capture_name(), + capture_index CHECK_FAILED); + } + RegExpCapture* capture = GetCapture(capture_index); + capture->set_body(body); + body = capture; + } else if (group_type == GROUPING) { + body = zone()->template New<RegExpGroup>(body); + } else { + DCHECK(group_type == POSITIVE_LOOKAROUND || + group_type == NEGATIVE_LOOKAROUND); + bool is_positive = (group_type == POSITIVE_LOOKAROUND); + body = zone()->template New<RegExpLookaround>( + body, is_positive, end_capture_index - capture_index, + capture_index, state->lookaround_type()); + } + + // Restore previous state. + state = state->previous_state(); + builder = state->builder(); + + builder->AddAtom(body); + // For compatibility with JSC and ES3, we allow quantifiers after + // lookaheads, and break in all cases. + break; + } + case '|': { + Advance(); + builder->NewAlternative(); + continue; + } + case '*': + case '+': + case '?': + return ReportError(RegExpError::kNothingToRepeat); + case '^': { + Advance(); + builder->AddAssertion(zone()->template New<RegExpAssertion>( + builder->multiline() ? 
RegExpAssertion::Type::START_OF_LINE + : RegExpAssertion::Type::START_OF_INPUT)); + set_contains_anchor(); + continue; + } + case '$': { + Advance(); + RegExpAssertion::Type assertion_type = + builder->multiline() ? RegExpAssertion::Type::END_OF_LINE + : RegExpAssertion::Type::END_OF_INPUT; + builder->AddAssertion( + zone()->template New<RegExpAssertion>(assertion_type)); + continue; + } + case '.': { + Advance(); + ZoneList<CharacterRange>* ranges = + zone()->template New<ZoneList<CharacterRange>>(2, zone()); + + if (builder->dotall()) { + // Everything. + CharacterRange::AddClassEscape(StandardCharacterSet::kEverything, + ranges, false, zone()); + } else { + // Everything except \x0A, \x0D, \u2028 and \u2029. + CharacterRange::AddClassEscape( + StandardCharacterSet::kNotLineTerminator, ranges, false, zone()); + } + + RegExpClassRanges* cc = + zone()->template New<RegExpClassRanges>(zone(), ranges); + builder->AddClassRanges(cc); + break; + } + case '(': { + state = ParseOpenParenthesis(state CHECK_FAILED); + builder = state->builder(); + continue; + } + case '[': { + RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); + if (cc->IsClassRanges()) { + builder->AddClassRanges(cc->AsClassRanges()); + } else { + DCHECK(cc->IsClassSetExpression()); + builder->AddTerm(cc); + } + break; + } + // Atom :: + // \ AtomEscape + case '\\': + switch (Next()) { + case kEndMarker: + return ReportError(RegExpError::kEscapeAtEndOfPattern); + // AtomEscape :: + // [+UnicodeMode] DecimalEscape + // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber + // of DecimalEscape is ≤ NcapturingParens + // CharacterEscape (some cases of this mixed in too) + // + // TODO(jgruber): It may make sense to disentangle all the different + // cases and make the structure mirror the spec, e.g. 
for AtomEscape: + // + // if (TryParseDecimalEscape(...)) return; + // if (TryParseCharacterClassEscape(...)) return; + // if (TryParseCharacterEscape(...)) return; + // if (TryParseGroupName(...)) return; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + int index = 0; + const bool is_backref = + ParseBackReferenceIndex(&index CHECK_FAILED); + if (is_backref) { + if (state->IsInsideCaptureGroup(index)) { + // The back reference is inside the capture group it refers to. + // Nothing can possibly have been captured yet, so we use empty + // instead. This ensures that, when checking a back reference, + // the capture registers of the referenced capture are either + // both set or both cleared. + builder->AddEmpty(); + } else { + RegExpCapture* capture = GetCapture(index); + RegExpTree* atom = zone()->template New<RegExpBackReference>( + capture, builder->flags()); + builder->AddAtom(atom); + } + break; + } + // With /u and /v, no identity escapes except for syntax characters + // are allowed. Otherwise, all identity escapes are allowed. + if (IsUnicodeMode()) { + return ReportError(RegExpError::kInvalidEscape); + } + base::uc32 first_digit = Next(); + if (first_digit == '8' || first_digit == '9') { + builder->AddCharacter(first_digit); + Advance(2); + break; + } + V8_FALLTHROUGH; + } + case '0': { + Advance(); + if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') { + // Decimal escape with leading 0 are not parsed as octal. 
+ return ReportError(RegExpError::kInvalidDecimalEscape); + } + base::uc32 octal = ParseOctalLiteral(); + builder->AddCharacter(octal); + break; + } + case 'b': + Advance(2); + builder->AddAssertion(zone()->template New<RegExpAssertion>( + RegExpAssertion::Type::BOUNDARY)); + continue; + case 'B': + Advance(2); + builder->AddAssertion(zone()->template New<RegExpAssertion>( + RegExpAssertion::Type::NON_BOUNDARY)); + continue; + // AtomEscape :: + // CharacterClassEscape + case 'd': + case 'D': + case 's': + case 'S': + case 'w': + case 'W': { + base::uc32 next = Next(); + ZoneList<CharacterRange>* ranges = + zone()->template New<ZoneList<CharacterRange>>(2, zone()); + bool add_unicode_case_equivalents = + IsUnicodeMode() && ignore_case(); + bool parsed_character_class_escape = TryParseCharacterClassEscape( + next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(), + add_unicode_case_equivalents CHECK_FAILED); + + if (parsed_character_class_escape) { + RegExpClassRanges* cc = + zone()->template New<RegExpClassRanges>(zone(), ranges); + builder->AddClassRanges(cc); + } else { + CHECK(!IsUnicodeMode()); + Advance(2); + builder->AddCharacter(next); // IdentityEscape. 
+ } + break; + } + case 'p': + case 'P': { + base::uc32 next = Next(); + ZoneList<CharacterRange>* ranges = + zone()->template New<ZoneList<CharacterRange>>(2, zone()); + CharacterClassStrings* strings = nullptr; + if (unicode_sets()) { + strings = zone()->template New<CharacterClassStrings>(zone()); + } + bool add_unicode_case_equivalents = ignore_case(); + bool parsed_character_class_escape = TryParseCharacterClassEscape( + next, InClassEscapeState::kNotInClass, ranges, strings, zone(), + add_unicode_case_equivalents CHECK_FAILED); + + if (parsed_character_class_escape) { + if (unicode_sets()) { + RegExpClassSetOperand* op = + zone()->template New<RegExpClassSetOperand>(ranges, + strings); + builder->AddTerm(op); + } else { + RegExpClassRanges* cc = + zone()->template New<RegExpClassRanges>(zone(), ranges); + builder->AddClassRanges(cc); + } + } else { + CHECK(!IsUnicodeMode()); + Advance(2); + builder->AddCharacter(next); // IdentityEscape. + } + break; + } + // AtomEscape :: + // k GroupName + case 'k': { + // Either an identity escape or a named back-reference. The two + // interpretations are mutually exclusive: '\k' is interpreted as + // an identity escape for non-Unicode patterns without named + // capture groups, and as the beginning of a named back-reference + // in all other cases. 
+ const bool has_named_captures = + HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED); + if (IsUnicodeMode() || has_named_captures) { + Advance(2); + ParseNamedBackReference(builder, state CHECK_FAILED); + break; + } + } + V8_FALLTHROUGH; + // AtomEscape :: + // CharacterEscape + default: { + bool is_escaped_unicode_character = false; + base::uc32 c = ParseCharacterEscape( + InClassEscapeState::kNotInClass, + &is_escaped_unicode_character CHECK_FAILED); + if (is_escaped_unicode_character) { + builder->AddEscapedUnicodeCharacter(c); + } else { + builder->AddCharacter(c); + } + break; + } + } + break; + case '{': { + int dummy; + bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); + if (parsed) return ReportError(RegExpError::kNothingToRepeat); + V8_FALLTHROUGH; + } + case '}': + case ']': + if (IsUnicodeMode()) { + return ReportError(RegExpError::kLoneQuantifierBrackets); + } + V8_FALLTHROUGH; + default: + builder->AddUnicodeCharacter(current()); + Advance(); + break; + } // end switch(current()) + + int min; + int max; + switch (current()) { + // QuantifierPrefix :: + // * + // + + // ? + // { + case '*': + min = 0; + max = RegExpTree::kInfinity; + Advance(); + break; + case '+': + min = 1; + max = RegExpTree::kInfinity; + Advance(); + break; + case '?': + min = 0; + max = 1; + Advance(); + break; + case '{': + if (ParseIntervalQuantifier(&min, &max)) { + if (max < min) { + return ReportError(RegExpError::kRangeOutOfOrder); + } + break; + } else if (IsUnicodeMode()) { + // Incomplete quantifiers are not allowed. + return ReportError(RegExpError::kIncompleteQuantifier); + } + continue; + default: + continue; + } + RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; + if (current() == '?') { + quantifier_type = RegExpQuantifier::NON_GREEDY; + Advance(); + } else if (v8_flags.regexp_possessive_quantifier && current() == '+') { + // v8_flags.regexp_possessive_quantifier is a debug-only flag. 
+ quantifier_type = RegExpQuantifier::POSSESSIVE; + Advance(); + } + if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { + return ReportError(RegExpError::kInvalidQuantifier); + } + } +} + +template <class CharT> +RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis( + RegExpParserState* state) { + RegExpLookaround::Type lookaround_type = state->lookaround_type(); + bool is_named_capture = false; + const ZoneVector<base::uc16>* capture_name = nullptr; + SubexpressionType subexpr_type = CAPTURE; + Advance(); + if (current() == '?') { + switch (Next()) { + case ':': + Advance(2); + subexpr_type = GROUPING; + break; + case '=': + Advance(2); + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = POSITIVE_LOOKAROUND; + break; + case '!': + Advance(2); + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = NEGATIVE_LOOKAROUND; + break; + case '<': + Advance(); + if (Next() == '=') { + Advance(2); + lookaround_type = RegExpLookaround::LOOKBEHIND; + subexpr_type = POSITIVE_LOOKAROUND; + break; + } else if (Next() == '!') { + Advance(2); + lookaround_type = RegExpLookaround::LOOKBEHIND; + subexpr_type = NEGATIVE_LOOKAROUND; + break; + } + is_named_capture = true; + has_named_captures_ = true; + Advance(); + break; + default: + ReportError(RegExpError::kInvalidGroup); + return nullptr; + } + } + if (subexpr_type == CAPTURE) { + if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) { + ReportError(RegExpError::kTooManyCaptures); + return nullptr; + } + captures_started_++; + + if (is_named_capture) { + capture_name = ParseCaptureGroupName(CHECK_FAILED); + } + } + // Store current state and begin new disjunction parsing. 
+ return zone()->template New<RegExpParserState>( + state, subexpr_type, lookaround_type, captures_started_, capture_name, + state->builder()->flags(), zone()); +} + +// In order to know whether an escape is a backreference or not we have to scan +// the entire regexp and find the number of capturing parentheses. However we +// don't want to scan the regexp twice unless it is necessary. This mini-parser +// is called when needed. It can see the difference between capturing and +// noncapturing parentheses and can skip character classes and backslash-escaped +// characters. +// +// Important: The scanner has to be in a consistent state when calling +// ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while +// parsing a nested class. +template <class CharT> +void RegExpParserImpl<CharT>::ScanForCaptures( + InClassEscapeState in_class_escape_state) { + DCHECK(!is_scanned_for_captures_); + const int saved_position = position(); + // Start with captures started previous to current position + int capture_count = captures_started(); + // When we start inside a character class, skip everything inside the class. + if (in_class_escape_state == InClassEscapeState::kInClass) { + // \k is always invalid within a class in unicode mode, thus we should never + // call ScanForCaptures within a class. + DCHECK(!IsUnicodeMode()); + int c; + while ((c = current()) != kEndMarker) { + Advance(); + if (c == '\\') { + Advance(); + } else { + if (c == ']') break; + } + } + } + // Add count of captures after this position. + int n; + while ((n = current()) != kEndMarker) { + Advance(); + switch (n) { + case '\\': + Advance(); + break; + case '[': { + int class_nest_level = 0; + int c; + while ((c = current()) != kEndMarker) { + Advance(); + if (c == '\\') { + Advance(); + } else if (c == '[') { + // With /v, '[' inside a class is treated as a nested class. + // Without /v, '[' is a normal character. 
if (unicode_sets()) class_nest_level++; + } else if (c == ']') { + if (class_nest_level == 0) break; + class_nest_level--; + } + } + break; + } + case '(': + if (current() == '?') { + // At this point we could be in + // * a non-capturing group '(?:', + // * a lookahead assertion '(?=' '(?!', + // * a lookbehind assertion '(?<=' '(?<!' + // * or a named capture '(?<'. + // + // Of these, only named captures are capturing groups. + + Advance(); + if (current() != '<') break; + + Advance(); + if (current() == '=' || current() == '!') break; + + // Found a possible named capture. It could turn out to be a syntax + // error (e.g. an unterminated or invalid name), but that distinction + // does not matter for our purposes. + has_named_captures_ = true; + } + capture_count++; + break; + } + } + capture_count_ = capture_count; + is_scanned_for_captures_ = true; + Reset(saved_position); +} + +template <class CharT> +bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) { + DCHECK_EQ('\\', current()); + DCHECK('1' <= Next() && Next() <= '9'); + // Try to parse a decimal literal that is no greater than the total number + // of left capturing parentheses in the input. 
+ int start = position(); + int value = Next() - '0'; + Advance(2); + while (true) { + base::uc32 c = current(); + if (IsDecimalDigit(c)) { + value = 10 * value + (c - '0'); + if (value > RegExpMacroAssembler::kMaxCaptures) { + Reset(start); + return false; + } + Advance(); + } else { + break; + } + } + if (value > captures_started()) { + if (!is_scanned_for_captures_) { + ScanForCaptures(InClassEscapeState::kNotInClass); + } + if (value > capture_count_) { + Reset(start); + return false; + } + } + *index_out = value; + return true; +} + +namespace { + +void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) { + if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { + v->push_back(code_unit); + } else { + v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); + v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); + } +} + +} // namespace + +template <class CharT> +const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() { + // Due to special Advance requirements (see the next comment), rewind by one + // such that names starting with a surrogate pair are parsed correctly for + // patterns where the unicode flag is unset. + // + // Note that we use this odd pattern of rewinding the last advance in order + // to adhere to the common parser behavior of expecting `current` to point at + // the first candidate character for a function (e.g. when entering ParseFoo, + // `current` should point at the first character of Foo). + RewindByOneCodepoint(); + + ZoneVector<base::uc16>* name = + zone()->template New<ZoneVector<base::uc16>>(zone()); + + { + // Advance behavior inside this function is tricky since + // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U) + // and thus allows surrogate pairs and \u{}-style escapes even in + // non-unicode patterns. Therefore Advance within the capture group name + // has to force-enable unicode, and outside the name revert to default + // behavior. 
+ ForceUnicodeScope force_unicode(this); + + bool at_start = true; + while (true) { + Advance(); + base::uc32 c = current(); + + // Convert unicode escapes. + if (c == '\\' && Next() == 'u') { + Advance(2); + if (!ParseUnicodeEscape(&c)) { + ReportError(RegExpError::kInvalidUnicodeEscape); + return nullptr; + } + RewindByOneCodepoint(); + } + + // The backslash char is misclassified as both ID_Start and ID_Continue. + if (c == '\\') { + ReportError(RegExpError::kInvalidCaptureGroupName); + return nullptr; + } + + if (at_start) { + if (!IsIdentifierStart(c)) { + ReportError(RegExpError::kInvalidCaptureGroupName); + return nullptr; + } + push_code_unit(name, c); + at_start = false; + } else { + if (c == '>') { + break; + } else if (IsIdentifierPart(c)) { + push_code_unit(name, c); + } else { + ReportError(RegExpError::kInvalidCaptureGroupName); + return nullptr; + } + } + } + } + + // This final advance goes back into the state of pointing at the next + // relevant char, which the rest of the parser expects. See also the previous + // comments in this function. + Advance(); + return name; +} + +template <class CharT> +bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex( + const ZoneVector<base::uc16>* name, int index) { + DCHECK(0 < index && index <= captures_started_); + DCHECK_NOT_NULL(name); + + RegExpCapture* capture = GetCapture(index); + DCHECK_NULL(capture->name()); + + capture->set_name(name); + + if (named_captures_ == nullptr) { + named_captures_ = + zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>( + zone()); + } else { + // Check for duplicates and bail if we find any. 
+ + const auto& named_capture_it = named_captures_->find(capture); + if (named_capture_it != named_captures_->end()) { + ReportError(RegExpError::kDuplicateCaptureGroupName); + return false; + } + } + + named_captures_->emplace(capture); + + return true; +} + +template <class CharT> +bool RegExpParserImpl<CharT>::ParseNamedBackReference( + RegExpBuilder* builder, RegExpParserState* state) { + // The parser is assumed to be on the '<' in \k<name>. + if (current() != '<') { + ReportError(RegExpError::kInvalidNamedReference); + return false; + } + + Advance(); + const ZoneVector<base::uc16>* name = ParseCaptureGroupName(); + if (name == nullptr) { + return false; + } + + if (state->IsInsideCaptureGroup(name)) { + builder->AddEmpty(); + } else { + RegExpBackReference* atom = + zone()->template New<RegExpBackReference>(builder->flags()); + atom->set_name(name); + + builder->AddAtom(atom); + + if (named_back_references_ == nullptr) { + named_back_references_ = + zone()->template New<ZoneList<RegExpBackReference*>>(1, zone()); + } + named_back_references_->Add(atom, zone()); + } + + return true; +} + +template <class CharT> +void RegExpParserImpl<CharT>::PatchNamedBackReferences() { + if (named_back_references_ == nullptr) return; + + if (named_captures_ == nullptr) { + ReportError(RegExpError::kInvalidNamedCaptureReference); + return; + } + + // Look up and patch the actual capture for each named back reference. + + for (int i = 0; i < named_back_references_->length(); i++) { + RegExpBackReference* ref = named_back_references_->at(i); + + // Capture used to search the named_captures_ by name, index of the + // capture is never used. 
+ static const int kInvalidIndex = 0; + RegExpCapture* search_capture = + zone()->template New<RegExpCapture>(kInvalidIndex); + DCHECK_NULL(search_capture->name()); + search_capture->set_name(ref->name()); + + int index = -1; + const auto& capture_it = named_captures_->find(search_capture); + if (capture_it != named_captures_->end()) { + index = (*capture_it)->index(); + } else { + ReportError(RegExpError::kInvalidNamedCaptureReference); + return; + } + + ref->set_capture(GetCapture(index)); + } +} + +template <class CharT> +RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) { + // Capture group indices are one-based; the position in the captures_ list is + // zero-based. + const int known_captures = + is_scanned_for_captures_ ? capture_count_ : captures_started_; + DCHECK(index <= known_captures); + if (captures_ == nullptr) { + captures_ = + zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone()); + } + while (captures_->length() < known_captures) { + captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1), + zone()); + } + return captures_->at(index - 1); +} + +template <class CharT> +ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const { + if (named_captures_ == nullptr || named_captures_->empty()) { + return nullptr; + } + + return zone()->template New<ZoneVector<RegExpCapture*>>( + named_captures_->begin(), named_captures_->end(), zone()); +} + +template <class CharT> +bool RegExpParserImpl<CharT>::HasNamedCaptures( + InClassEscapeState in_class_escape_state) { + if (has_named_captures_ || is_scanned_for_captures_) { + return has_named_captures_; + } + + ScanForCaptures(in_class_escape_state); + DCHECK(is_scanned_for_captures_); + return has_named_captures_; +} + +// QuantifierPrefix :: +// { DecimalDigits } +// { DecimalDigits , } +// { DecimalDigits , DecimalDigits } +// +// Returns true if parsing succeeds, and sets the min_out and max_out +// values. 
Values are truncated to RegExpTree::kInfinity if they overflow. +template <class CharT> +bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out, + int* max_out) { + DCHECK_EQ(current(), '{'); + int start = position(); + Advance(); + int min = 0; + if (!IsDecimalDigit(current())) { + Reset(start); + return false; + } + while (IsDecimalDigit(current())) { + int next = current() - '0'; + if (min > (RegExpTree::kInfinity - next) / 10) { + // Overflow. Skip past remaining decimal digits and clamp min to + // RegExpTree::kInfinity. + do { + Advance(); + } while (IsDecimalDigit(current())); + min = RegExpTree::kInfinity; + break; + } + min = 10 * min + next; + Advance(); + } + int max = 0; + if (current() == '}') { + max = min; + Advance(); + } else if (current() == ',') { + Advance(); + if (current() == '}') { + max = RegExpTree::kInfinity; + Advance(); + } else { + while (IsDecimalDigit(current())) { + int next = current() - '0'; + if (max > (RegExpTree::kInfinity - next) / 10) { + do { + Advance(); + } while (IsDecimalDigit(current())); + max = RegExpTree::kInfinity; + break; + } + max = 10 * max + next; + Advance(); + } + if (current() != '}') { + Reset(start); + return false; + } + Advance(); + } + } else { + Reset(start); + return false; + } + *min_out = min; + *max_out = max; + return true; +} + +template <class CharT> +base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() { + DCHECK(('0' <= current() && current() <= '7') || !has_more()); + // For compatibility with some other browsers (not all), we parse + // up to three octal digits with a value below 256. 
+ // ES#prod-annexB-LegacyOctalEscapeSequence + base::uc32 value = current() - '0'; + Advance(); + if ('0' <= current() && current() <= '7') { + value = value * 8 + current() - '0'; + Advance(); + if (value < 32 && '0' <= current() && current() <= '7') { + value = value * 8 + current() - '0'; + Advance(); + } + } + return value; +} + +template <class CharT> +bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) { + int start = position(); + base::uc32 val = 0; + for (int i = 0; i < length; ++i) { + base::uc32 c = current(); + int d = base::HexValue(c); + if (d < 0) { + Reset(start); + return false; + } + val = val * 16 + d; + Advance(); + } + *value = val; + return true; +} + +// This parses RegExpUnicodeEscapeSequence as described in ECMA262. +template <class CharT> +bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) { + // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are + // allowed). In the latter case, the number of hex digits between { } is + // arbitrary. \ and u have already been read. + if (current() == '{' && IsUnicodeMode()) { + int start = position(); + Advance(); + if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { + if (current() == '}') { + Advance(); + return true; + } + } + Reset(start); + return false; + } + // \u but no {, or \u{...} escapes not allowed. + bool result = ParseHexEscape(4, value); + if (result && IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) && + current() == '\\') { + // Attempt to read trail surrogate. 
+ int start = position(); + if (Next() == 'u') { + Advance(2); + base::uc32 trail; + if (ParseHexEscape(4, &trail) && + unibrow::Utf16::IsTrailSurrogate(trail)) { + *value = unibrow::Utf16::CombineSurrogatePair( + static_cast<base::uc16>(*value), static_cast<base::uc16>(trail)); + return true; + } + } + Reset(start); + } + return result; +} + +#ifdef V8_INTL_SUPPORT + +namespace { + +bool IsExactPropertyAlias(const char* property_name, UProperty property) { + const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); + if (short_name != nullptr && strcmp(property_name, short_name) == 0) + return true; + for (int i = 0;; i++) { + const char* long_name = u_getPropertyName( + property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); + if (long_name == nullptr) break; + if (strcmp(property_name, long_name) == 0) return true; + } + return false; +} + +bool IsExactPropertyValueAlias(const char* property_value_name, + UProperty property, int32_t property_value) { + const char* short_name = + u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); + if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { + return true; + } + for (int i = 0;; i++) { + const char* long_name = u_getPropertyValueName( + property, property_value, + static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); + if (long_name == nullptr) break; + if (strcmp(property_value_name, long_name) == 0) return true; + } + return false; +} + +void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set, + CharacterClassStrings* strings, + RegExpFlags flags, Zone* zone) { + DCHECK(set.hasStrings()); + DCHECK(IsUnicodeSets(flags)); + DCHECK_NOT_NULL(strings); + + RegExpTextBuilder::SmallRegExpTreeVector string_storage( + ZoneAllocator<RegExpTree*>{zone}); + RegExpTextBuilder string_builder(zone, &string_storage, flags); + const bool needs_case_folding = IsIgnoreCase(flags); + icu::UnicodeSetIterator iter(set); + iter.skipToStrings(); + 
while (iter.next()) { + const icu::UnicodeString& s = iter.getString(); + const char16_t* p = s.getBuffer(); + int32_t length = s.length(); + ZoneList<base::uc32>* string = + zone->template New<ZoneList<base::uc32>>(length, zone); + for (int32_t i = 0; i < length;) { + UChar32 c; + U16_NEXT(p, i, length, c); + string_builder.AddUnicodeCharacter(c); + if (needs_case_folding) { + c = u_foldCase(c, U_FOLD_CASE_DEFAULT); + } + string->Add(c, zone); + } + strings->emplace(string->ToVector(), string_builder.ToRegExp()); + string_storage.clear(); + } +} + +bool LookupPropertyValueName(UProperty property, + const char* property_value_name, bool negate, + ZoneList<CharacterRange>* result_ranges, + CharacterClassStrings* result_strings, + RegExpFlags flags, Zone* zone) { + UProperty property_for_lookup = property; + if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { + // For the property Script_Extensions, we have to do the property value + // name lookup as if the property is Script. + property_for_lookup = UCHAR_SCRIPT; + } + int32_t property_value = + u_getPropertyValueEnum(property_for_lookup, property_value_name); + if (property_value == UCHAR_INVALID_CODE) return false; + + // We require the property name to match exactly to one of the property value + // aliases. However, u_getPropertyValueEnum uses loose matching. 
+ if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, + property_value)) { + return false; + } + + UErrorCode ec = U_ZERO_ERROR; + icu::UnicodeSet set; + set.applyIntPropertyValue(property, property_value, ec); + bool success = ec == U_ZERO_ERROR && !set.isEmpty(); + + if (success) { + if (set.hasStrings()) { + ExtractStringsFromUnicodeSet(set, result_strings, flags, zone); + } + const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags); + if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set); + set.removeAllStrings(); + if (negate) set.complement(); + for (int i = 0; i < set.getRangeCount(); i++) { + result_ranges->Add( + CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), + zone); + } + } + return success; +} + +template <size_t N> +inline bool NameEquals(const char* name, const char (&literal)[N]) { + return strncmp(name, literal, N + 1) == 0; +} + +bool LookupSpecialPropertyValueName(const char* name, + ZoneList<CharacterRange>* result, + bool negate, RegExpFlags flags, + Zone* zone) { + if (NameEquals(name, "Any")) { + if (negate) { + // Leave the list of character ranges empty, since the negation of 'Any' + // is the empty set. + } else { + result->Add(CharacterRange::Everything(), zone); + } + } else if (NameEquals(name, "ASCII")) { + result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) + : CharacterRange::Range(0x0, 0x7F), + zone); + } else if (NameEquals(name, "Assigned")) { + return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", + !negate, result, nullptr, flags, zone); + } else { + return false; + } + return true; +} + +// Explicitly allowlist supported binary properties. The spec forbids supporting +// properties outside of this set to ensure interoperability. +bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) { + switch (property) { + case UCHAR_ALPHABETIC: + // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. 
+ // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. + case UCHAR_ASCII_HEX_DIGIT: + // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. + case UCHAR_BIDI_CONTROL: + case UCHAR_BIDI_MIRRORED: + case UCHAR_CASE_IGNORABLE: + case UCHAR_CASED: + case UCHAR_CHANGES_WHEN_CASEFOLDED: + case UCHAR_CHANGES_WHEN_CASEMAPPED: + case UCHAR_CHANGES_WHEN_LOWERCASED: + case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: + case UCHAR_CHANGES_WHEN_TITLECASED: + case UCHAR_CHANGES_WHEN_UPPERCASED: + case UCHAR_DASH: + case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: + case UCHAR_DEPRECATED: + case UCHAR_DIACRITIC: + case UCHAR_EMOJI: + case UCHAR_EMOJI_COMPONENT: + case UCHAR_EMOJI_MODIFIER_BASE: + case UCHAR_EMOJI_MODIFIER: + case UCHAR_EMOJI_PRESENTATION: + case UCHAR_EXTENDED_PICTOGRAPHIC: + case UCHAR_EXTENDER: + case UCHAR_GRAPHEME_BASE: + case UCHAR_GRAPHEME_EXTEND: + case UCHAR_HEX_DIGIT: + case UCHAR_ID_CONTINUE: + case UCHAR_ID_START: + case UCHAR_IDEOGRAPHIC: + case UCHAR_IDS_BINARY_OPERATOR: + case UCHAR_IDS_TRINARY_OPERATOR: + case UCHAR_JOIN_CONTROL: + case UCHAR_LOGICAL_ORDER_EXCEPTION: + case UCHAR_LOWERCASE: + case UCHAR_MATH: + case UCHAR_NONCHARACTER_CODE_POINT: + case UCHAR_PATTERN_SYNTAX: + case UCHAR_PATTERN_WHITE_SPACE: + case UCHAR_QUOTATION_MARK: + case UCHAR_RADICAL: + case UCHAR_REGIONAL_INDICATOR: + case UCHAR_S_TERM: + case UCHAR_SOFT_DOTTED: + case UCHAR_TERMINAL_PUNCTUATION: + case UCHAR_UNIFIED_IDEOGRAPH: + case UCHAR_UPPERCASE: + case UCHAR_VARIATION_SELECTOR: + case UCHAR_WHITE_SPACE: + case UCHAR_XID_CONTINUE: + case UCHAR_XID_START: + return true; + case UCHAR_BASIC_EMOJI: + case UCHAR_EMOJI_KEYCAP_SEQUENCE: + case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: + case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: + case UCHAR_RGI_EMOJI_TAG_SEQUENCE: + case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: + case UCHAR_RGI_EMOJI: + return unicode_sets; + default: + break; + } + return false; +} + +bool IsBinaryPropertyOfStrings(UProperty property) { + switch (property) { 
+ case UCHAR_BASIC_EMOJI: + case UCHAR_EMOJI_KEYCAP_SEQUENCE: + case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: + case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: + case UCHAR_RGI_EMOJI_TAG_SEQUENCE: + case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: + case UCHAR_RGI_EMOJI: + return true; + default: + break; + } + return false; +} + +bool IsUnicodePropertyValueCharacter(char c) { + // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ + // + // Note that using this to validate each parsed char is quite conservative. + // A possible alternative solution would be to only ensure the parsed + // property name/value candidate string does not contain '\0' characters and + // let ICU lookups trigger the final failure. + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + if ('0' <= c && c <= '9') return true; + return (c == '_'); +} + +} // namespace + +template <class CharT> +bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2) { + DCHECK(name_1->empty()); + DCHECK(name_2->empty()); + // Parse the property class as follows: + // - In \p{name}, 'name' is interpreted + // - either as a general category property value name. + // - or as a binary property name. + // - In \p{name=value}, 'name' is interpreted as an enumerated property name, + // and 'value' is interpreted as one of the available property value names. + // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. + // - Loose matching is not applied. 
+ if (current() == '{') { + // Parse \p{[PropertyName=]PropertyNameValue} + for (Advance(); current() != '}' && current() != '='; Advance()) { + if (!IsUnicodePropertyValueCharacter(current())) return false; + if (!has_next()) return false; + name_1->push_back(static_cast<char>(current())); + } + if (current() == '=') { + for (Advance(); current() != '}'; Advance()) { + if (!IsUnicodePropertyValueCharacter(current())) return false; + if (!has_next()) return false; + name_2->push_back(static_cast<char>(current())); + } + name_2->push_back(0); // null-terminate string. + } + } else { + return false; + } + Advance(); + name_1->push_back(0); // null-terminate string. + + DCHECK(name_1->size() - 1 == std::strlen(name_1->data())); + DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); + return true; +} + +template <class CharT> +bool RegExpParserImpl<CharT>::AddPropertyClassRange( + ZoneList<CharacterRange>* add_to_ranges, + CharacterClassStrings* add_to_strings, bool negate, + const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { + if (name_2.empty()) { + // First attempt to interpret as general category property value name. + const char* name = name_1.data(); + if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, + add_to_ranges, add_to_strings, flags(), + zone())) { + return true; + } + // Interpret "Any", "ASCII", and "Assigned". + if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(), + zone())) { + return true; + } + // Then attempt to interpret as binary property name with value name 'Y'. + UProperty property = u_getPropertyEnum(name); + if (!IsSupportedBinaryProperty(property, unicode_sets())) return false; + if (!IsExactPropertyAlias(name, property)) return false; + // Negation of properties with strings is not allowed. + // TODO(v8:11935): Change permalink once proposal is in stage 4. 
+ // See + // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#sec-static-semantics-maycontainstrings + if (negate && IsBinaryPropertyOfStrings(property)) return false; + return LookupPropertyValueName(property, negate ? "N" : "Y", false, + add_to_ranges, add_to_strings, flags(), + zone()); + } else { + // Both property name and value name are specified. Attempt to interpret + // the property name as enumerated property. + const char* property_name = name_1.data(); + const char* value_name = name_2.data(); + UProperty property = u_getPropertyEnum(property_name); + if (!IsExactPropertyAlias(property_name, property)) return false; + if (property == UCHAR_GENERAL_CATEGORY) { + // We want to allow aggregate value names such as "Letter". + property = UCHAR_GENERAL_CATEGORY_MASK; + } else if (property != UCHAR_SCRIPT && + property != UCHAR_SCRIPT_EXTENSIONS) { + return false; + } + return LookupPropertyValueName(property, value_name, negate, add_to_ranges, + add_to_strings, flags(), zone()); + } +} + +#else // V8_INTL_SUPPORT + +template <class CharT> +bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2) { + return false; +} + +template <class CharT> +bool RegExpParserImpl<CharT>::AddPropertyClassRange( + ZoneList<CharacterRange>* add_to_ranges, + CharacterClassStrings* add_to_strings, bool negate, + const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { + return false; +} + +#endif // V8_INTL_SUPPORT + +template <class CharT> +bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value, + base::uc32* value) { + base::uc32 x = 0; + int d = base::HexValue(current()); + if (d < 0) { + return false; + } + while (d >= 0) { + x = x * 16 + d; + if (x > static_cast<base::uc32>(max_value)) { + return false; + } + Advance(); + d = base::HexValue(current()); + } + *value = x; + return true; +} + +// https://tc39.es/ecma262/#prod-CharacterEscape +template <class CharT> +base::uc32 
RegExpParserImpl<CharT>::ParseCharacterEscape( + InClassEscapeState in_class_escape_state, + bool* is_escaped_unicode_character) { + DCHECK_EQ('\\', current()); + DCHECK(has_next()); + + Advance(); + + const base::uc32 c = current(); + switch (c) { + // CharacterEscape :: + // ControlEscape :: one of + // f n r t v + case 'f': + Advance(); + return '\f'; + case 'n': + Advance(); + return '\n'; + case 'r': + Advance(); + return '\r'; + case 't': + Advance(); + return '\t'; + case 'v': + Advance(); + return '\v'; + // CharacterEscape :: + // c ControlLetter + case 'c': { + base::uc32 controlLetter = Next(); + base::uc32 letter = controlLetter & ~('A' ^ 'a'); + if (letter >= 'A' && letter <= 'Z') { + Advance(2); + // Control letters mapped to ASCII control characters in the range + // 0x00-0x1F. + return controlLetter & 0x1F; + } + if (IsUnicodeMode()) { + // With /u and /v, invalid escapes are not treated as identity escapes. + ReportError(RegExpError::kInvalidUnicodeEscape); + return 0; + } + if (in_class_escape_state == InClassEscapeState::kInClass) { + // Inside a character class, we also accept digits and underscore as + // control characters, unless with /u or /v. See Annex B: + // ES#prod-annexB-ClassControlLetter + if ((controlLetter >= '0' && controlLetter <= '9') || + controlLetter == '_') { + Advance(2); + return controlLetter & 0x1F; + } + } + // We match JSC in reading the backslash as a literal + // character instead of as starting an escape. + return '\\'; + } + // CharacterEscape :: + // 0 [lookahead ∉ DecimalDigit] + // [~UnicodeMode] LegacyOctalEscapeSequence + case '0': + // \0 is interpreted as NUL if not followed by another digit. 
+ if (Next() < '0' || Next() > '9') { + Advance(); + return 0; + } + V8_FALLTHROUGH; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + // For compatibility, we interpret a decimal escape that isn't + // a back reference (and therefore either \0 or not valid according + // to the specification) as a 1..3 digit octal character code. + // ES#prod-annexB-LegacyOctalEscapeSequence + if (IsUnicodeMode()) { + // With /u or /v, decimal escape is not interpreted as octal character + // code. + ReportError(RegExpError::kInvalidClassEscape); + return 0; + } + return ParseOctalLiteral(); + // CharacterEscape :: + // HexEscapeSequence + case 'x': { + Advance(); + base::uc32 value; + if (ParseHexEscape(2, &value)) return value; + if (IsUnicodeMode()) { + // With /u or /v, invalid escapes are not treated as identity escapes. + ReportError(RegExpError::kInvalidEscape); + return 0; + } + // If \x is not followed by a two-digit hexadecimal, treat it + // as an identity escape. + return 'x'; + } + // CharacterEscape :: + // RegExpUnicodeEscapeSequence [?UnicodeMode] + case 'u': { + Advance(); + base::uc32 value; + if (ParseUnicodeEscape(&value)) { + *is_escaped_unicode_character = true; + return value; + } + if (IsUnicodeMode()) { + // With /u or /v, invalid escapes are not treated as identity escapes. + ReportError(RegExpError::kInvalidUnicodeEscape); + return 0; + } + // If \u is not followed by a two-digit hexadecimal, treat it + // as an identity escape. + return 'u'; + } + default: + break; + } + + // CharacterEscape :: + // IdentityEscape[?UnicodeMode, ?N] + // + // * With /u, no identity escapes except for syntax characters are + // allowed. + // * With /v, no identity escapes except for syntax characters and + // ClassSetReservedPunctuators (if within a class) are allowed. + // * Without /u or /v: + // * '\c' is not an IdentityEscape. + // * '\k' is not an IdentityEscape when named captures exist. 
+ // * Otherwise, all identity escapes are allowed. + if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) { + if (IsClassSetReservedPunctuator(c)) { + Advance(); + return c; + } + } + if (IsUnicodeMode()) { + if (!IsSyntaxCharacterOrSlash(c)) { + ReportError(RegExpError::kInvalidEscape); + return 0; + } + Advance(); + return c; + } + DCHECK(!IsUnicodeMode()); + if (c == 'c') { + ReportError(RegExpError::kInvalidEscape); + return 0; + } + Advance(); + // Note: It's important to Advance before the HasNamedCaptures call s.t. we + // don't start scanning in the middle of an escape. + if (c == 'k' && HasNamedCaptures(in_class_escape_state)) { + ReportError(RegExpError::kInvalidEscape); + return 0; + } + return c; +} + +// TODO(v8:11935): Change permalink once proposal is in stage 4. +// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassRanges +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges( + ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) { + base::uc32 char_1, char_2; + bool is_class_1, is_class_2; + while (has_more() && current() != ']') { + ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, + &is_class_1 CHECK_FAILED); + // ClassAtom + if (current() == '-') { + Advance(); + if (!has_more()) { + // If we reach the end we break out of the loop and let the + // following code report an error. + break; + } else if (current() == ']') { + if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); + ranges->Add(CharacterRange::Singleton('-'), zone()); + break; + } + ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, + &is_class_2 CHECK_FAILED); + if (is_class_1 || is_class_2) { + // Either end is an escaped character class. Treat the '-' verbatim. + if (IsUnicodeMode()) { + // ES2015 21.2.2.15.1 step 1. 
+ return ReportError(RegExpError::kInvalidCharacterClass); + } + if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); + ranges->Add(CharacterRange::Singleton('-'), zone()); + if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); + continue; + } + // ES2015 21.2.2.15.1 step 6. + if (char_1 > char_2) { + return ReportError(RegExpError::kOutOfOrderCharacterClass); + } + ranges->Add(CharacterRange::Range(char_1, char_2), zone()); + } else { + if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); + } + } + return nullptr; +} + +// https://tc39.es/ecma262/#prod-ClassEscape +template <class CharT> +void RegExpParserImpl<CharT>::ParseClassEscape( + ZoneList<CharacterRange>* ranges, Zone* zone, + bool add_unicode_case_equivalents, base::uc32* char_out, + bool* is_class_escape) { + *is_class_escape = false; + + if (current() != '\\') { + // Not a ClassEscape. + *char_out = current(); + Advance(); + return; + } + + const base::uc32 next = Next(); + switch (next) { + case 'b': + *char_out = '\b'; + Advance(2); + return; + case '-': + if (IsUnicodeMode()) { + *char_out = next; + Advance(2); + return; + } + break; + case kEndMarker: + ReportError(RegExpError::kEscapeAtEndOfPattern); + return; + default: + break; + } + + static constexpr InClassEscapeState kInClassEscape = + InClassEscapeState::kInClass; + *is_class_escape = + TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone, + add_unicode_case_equivalents); + if (*is_class_escape) return; + + bool dummy = false; // Unused. 
+ *char_out = ParseCharacterEscape(kInClassEscape, &dummy); +} + +// https://tc39.es/ecma262/#prod-CharacterClassEscape +template <class CharT> +bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape( + base::uc32 next, InClassEscapeState in_class_escape_state, + ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings, + Zone* zone, bool add_unicode_case_equivalents) { + DCHECK_EQ(current(), '\\'); + DCHECK_EQ(Next(), next); + + switch (next) { + case 'd': + case 'D': + case 's': + case 'S': + case 'w': + case 'W': + CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next), + ranges, add_unicode_case_equivalents, + zone); + Advance(2); + return true; + case 'p': + case 'P': { + if (!IsUnicodeMode()) return false; + bool negate = next == 'P'; + Advance(2); + ZoneVector<char> name_1(zone); + ZoneVector<char> name_2(zone); + if (!ParsePropertyClassName(&name_1, &name_2) || + !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) { + ReportError(in_class_escape_state == InClassEscapeState::kInClass + ? RegExpError::kInvalidClassPropertyName + : RegExpError::kInvalidPropertyName); + } + return true; + } + default: + return false; + } +} + +namespace { + +// Add |string| to |ranges| if length of |string| == 1, otherwise add |string| +// to |strings|. +void AddClassString(ZoneList<base::uc32>* normalized_string, + RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges, + CharacterClassStrings* strings, Zone* zone) { + if (normalized_string->length() == 1) { + ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone); + } else { + strings->emplace(normalized_string->ToVector(), regexp_string); + } +} + +} // namespace + +// TODO(v8:11935): Change permalink once proposal is in stage 4. 
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassStringDisjunction +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction( + ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) { + DCHECK(unicode_sets()); + DCHECK_EQ(current(), '\\'); + DCHECK_EQ(Next(), 'q'); + Advance(2); + if (current() != '{') { + // Identity escape of 'q' is not allowed in unicode mode. + return ReportError(RegExpError::kInvalidEscape); + } + Advance(); + + ZoneList<base::uc32>* string = + zone()->template New<ZoneList<base::uc32>>(4, zone()); + RegExpTextBuilder::SmallRegExpTreeVector string_storage( + ZoneAllocator<RegExpTree*>{zone()}); + RegExpTextBuilder string_builder(zone(), &string_storage, flags()); + + while (has_more() && current() != '}') { + if (current() == '|') { + AddClassString(string, string_builder.ToRegExp(), ranges, strings, + zone()); + string = zone()->template New<ZoneList<base::uc32>>(4, zone()); + string_storage.clear(); + Advance(); + } else { + base::uc32 c = ParseClassSetCharacter(CHECK_FAILED); + if (ignore_case()) { +#ifdef V8_INTL_SUPPORT + c = u_foldCase(c, U_FOLD_CASE_DEFAULT); +#else + c = AsciiAlphaToLower(c); +#endif + } + string->Add(c, zone()); + string_builder.AddUnicodeCharacter(c); + } + } + + AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone()); + CharacterRange::Canonicalize(ranges); + + // We don't need to handle missing closing '}' here. + // If the character class is correctly closed, ParseClassSetCharacter will + // report an error. + Advance(); + return nullptr; +} + +// TODO(v8:11935): Change permalink once proposal is in stage 4. 
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// Parses one ClassSetOperand into freshly allocated range/string lists and,
+// on success, returns a tree chosen by |type_out|:
+// * kNestedClass: the RegExpClassSetExpression returned by the inner parse.
+// * For all other types: a new RegExpClassSetOperand wrapping the collected
+//   ranges and strings.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
+    const RegExpBuilder* builder, ClassSetOperandType* type_out) {
+  ZoneList<CharacterRange>* operand_ranges =
+      zone()->template New<ZoneList<CharacterRange>>(1, zone());
+  CharacterClassStrings* operand_strings =
+      zone()->template New<CharacterClassStrings>(zone());
+  RegExpTree* parsed = ParseClassSetOperand(builder, type_out, operand_ranges,
+                                            operand_strings CHECK_FAILED);
+  // Only a nested class yields a tree from the inner parse.
+  DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
+                 parsed == nullptr);
+  // A single character contributes exactly one range and no strings.
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
+                 operand_ranges->length() == 1);
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
+                 operand_strings->empty());
+  // A nested class leaves the lists untouched and returns an expression.
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+                 operand_ranges->is_empty());
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+                 operand_strings->empty());
+  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
+                 parsed->IsClassSetExpression());
+  // ClassSetRange is only used within ClassSetUnion().
+  DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
+  // kCharacterClassEscape is unrestricted: \p{} can contain ranges, strings
+  // or both, and \P{} could contain nothing (i.e. \P{Any}).
+  if (parsed != nullptr) return parsed;
+  return zone()->template New<RegExpClassSetOperand>(operand_ranges,
+                                                     operand_strings);
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetOperand
+// Based on |type_out| either a tree is returned or ranges/strings modified.
+// If a tree is returned, ranges/strings are not modified.
+// If |type_out| is kNestedClass, a tree of type RegExpClassSetExpression is
+// returned. For all other types, ranges is modified and nullptr is returned.
+// |builder| is only forwarded to ParseCharacterClass() for a nested class.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
+    const RegExpBuilder* builder, ClassSetOperandType* type_out,
+    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  base::uc32 c = current();
+  if (c == '\\') {
+    const base::uc32 next = Next();
+    // \q{...}: class string disjunction.
+    if (next == 'q') {
+      *type_out = ClassSetOperandType::kClassStringDisjunction;
+      ParseClassStringDisjunction(ranges, strings CHECK_FAILED);
+      return nullptr;
+    }
+    static constexpr InClassEscapeState kInClassEscape =
+        InClassEscapeState::kInClass;
+    const bool add_unicode_case_equivalents = ignore_case();
+    // Character class escapes add their contents to |ranges|/|strings|.
+    if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings,
+                                     zone(), add_unicode_case_equivalents)) {
+      *type_out = ClassSetOperandType::kCharacterClassEscape;
+      return nullptr;
+    }
+    // Any other escape falls through and is parsed as a single
+    // ClassSetCharacter below.
+  }
+
+  // '[' opens a nested class; this is the only case that returns a tree.
+  if (c == '[') {
+    *type_out = ClassSetOperandType::kNestedClass;
+    return ParseCharacterClass(builder);
+  }
+
+  // Everything else is a single character, added as a singleton range.
+  *type_out = ClassSetOperandType::kClassSetCharacter;
+  c = ParseClassSetCharacter(CHECK_FAILED);
+  ranges->Add(CharacterRange::Singleton(c), zone());
+  return nullptr;
+}
+
+// Parses a single ClassSetCharacter at the current position and returns its
+// code point. Escapes other than \b are delegated to ParseCharacterEscape().
+// The error paths visible in this function call ReportError() and return 0.
+template <class CharT>
+base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
+  DCHECK(unicode_sets());
+  const base::uc32 c = current();
+  if (c == '\\') {
+    const base::uc32 next = Next();
+    switch (next) {
+      case 'b':
+        // \b denotes the backspace character here.
+        Advance(2);
+        return '\b';
+      case kEndMarker:
+        // The pattern ends right after the backslash.
+        ReportError(RegExpError::kEscapeAtEndOfPattern);
+        return 0;
+    }
+    static constexpr InClassEscapeState kInClassEscape =
+        InClassEscapeState::kInClass;
+
+    bool dummy = false;  // Unused.
+    return ParseCharacterEscape(kInClassEscape, &dummy);
+  }
+  // Unescaped syntax characters and reserved double punctuators are rejected.
+  if (IsClassSetSyntaxCharacter(c)) {
+    ReportError(RegExpError::kInvalidCharacterInClass);
+    return 0;
+  }
+  if (IsClassSetReservedDoublePunctuator(c)) {
+    ReportError(RegExpError::kInvalidClassSetOperation);
+    return 0;
+  }
+  Advance();
+  return c;
+}
+
+namespace {
+
+// Returns whether an operand of the given |type| may contain strings.
+// Characters and ranges never do; escapes and string disjunctions ask the
+// RegExpClassSetOperand; nested classes ask the RegExpClassSetExpression
+// unless the tree is a RegExpClassRanges.
+// NOTE(review): there is no return after the switch; presumably every
+// ClassSetOperandType enumerator is handled above - confirm.
+bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
+  switch (type) {
+    case ClassSetOperandType::kClassSetCharacter:
+    case ClassSetOperandType::kClassSetRange:
+      return false;
+    case ClassSetOperandType::kCharacterClassEscape:
+    case ClassSetOperandType::kClassStringDisjunction:
+      return operand->AsClassSetOperand()->has_strings();
+    case ClassSetOperandType::kNestedClass:
+      if (operand->IsClassRanges()) return false;
+      return operand->AsClassSetExpression()->may_contain_strings();
+  }
+}
+
+}  // namespace
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassUnion
+//
+// Parses the rest of a ClassUnion after its first operand was consumed by the
+// caller, up to and including the closing ']'. Returns a
+// RegExpClassSetExpression of type kUnion, or the Empty() expression when no
+// operands were collected. Reports an error for an unterminated class, for a
+// malformed or out-of-order range, for "--" inside a union, and for a negated
+// class that may contain strings.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
+    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+    ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
+    CharacterClassStrings* strings) {
+  DCHECK(unicode_sets());
+  ZoneList<RegExpTree*>* operands =
+      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+  bool may_contain_strings = false;
+  // Add the lhs to operands if necessary.
+  // Either the lhs values were added to |ranges|/|strings| (in which case
+  // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed
+  // as |first_operand| (in which case |ranges| and |strings| are empty).
+  if (first_operand != nullptr) {
+    may_contain_strings = MayContainStrings(first_operand_type, first_operand);
+    operands->Add(first_operand, zone());
+  }
+  // Type of the most recently parsed operand; decides whether a following
+  // '-' can form a range.
+  ClassSetOperandType last_type = first_operand_type;
+  const bool needs_case_folding = ignore_case();
+  while (has_more() && current() != ']') {
+    if (current() == '-') {
+      // Mix of ClassSetRange and ClassSubtraction is not allowed.
+      if (Next() == '-') {
+        return ReportError(RegExpError::kInvalidClassSetOperation);
+      }
+      Advance();
+      if (!has_more()) {
+        // If we reach the end we break out of the loop and let the
+        // following code report an error.
+        break;
+      }
+      // If the lhs and rhs around '-' are both ClassSetCharacters, they
+      // represent a character range.
+      // In case one of them is not a ClassSetCharacter, it is a syntax error,
+      // as '-' can not be used unescaped within a class with /v.
+      // TODO(v8:11935): Change permalink once proposal is in stage 4.
+      // See
+      // https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSetRange
+      if (last_type != ClassSetOperandType::kClassSetCharacter) {
+        return ReportError(RegExpError::kInvalidCharacterClass);
+      }
+      ParseClassSetOperand(builder, &last_type, ranges, strings CHECK_FAILED);
+      if (last_type != ClassSetOperandType::kClassSetCharacter) {
+        return ReportError(RegExpError::kInvalidCharacterClass);
+      }
+      // Remove the last two singleton characters added to ranges, and combine
+      // them into a range.
+      auto rhs_ranges = ranges->RemoveLast();
+      auto lhs_ranges = ranges->RemoveLast();
+      DCHECK(lhs_ranges.IsSingleton());
+      DCHECK(rhs_ranges.IsSingleton());
+      base::uc32 from = lhs_ranges.from();
+      base::uc32 to = rhs_ranges.from();
+      if (from > to) {
+        return ReportError(RegExpError::kOutOfOrderCharacterClass);
+      }
+      ranges->Add(CharacterRange::Range(from, to), zone());
+      // Marking the result as a range makes a directly following '-' an
+      // error on the next iteration.
+      last_type = ClassSetOperandType::kClassSetRange;
+    } else {
+      DCHECK_NE(current(), '-');
+      RegExpTree* operand = ParseClassSetOperand(builder, &last_type, ranges,
+                                                 strings CHECK_FAILED);
+      // A null |operand| means the values were appended to
+      // |ranges|/|strings|; nothing else to do until they are flushed.
+      if (operand != nullptr) {
+        may_contain_strings |= MayContainStrings(last_type, operand);
+        // Add the range we started building as operand and reset the current
+        // range.
+        if (!ranges->is_empty() || !strings->empty()) {
+          if (needs_case_folding) {
+            CharacterRange::Canonicalize(ranges);
+            CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
+          }
+          may_contain_strings |= !strings->empty();
+          operands->Add(
+              zone()->template New<RegExpClassSetOperand>(ranges, strings),
+              zone());
+          // Start fresh lists; the flushed ones now belong to the operand.
+          ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
+          strings = zone()->template New<CharacterClassStrings>(zone());
+        }
+        operands->Add(operand, zone());
+      }
+    }
+  }
+
+  if (!has_more()) {
+    return ReportError(RegExpError::kUnterminatedCharacterClass);
+  }
+
+  // Add the range we started building as operand.
+  if (!ranges->is_empty() || !strings->empty()) {
+    if (needs_case_folding) {
+      CharacterRange::Canonicalize(ranges);
+      CharacterRange::AddUnicodeCaseEquivalents(ranges, zone());
+    }
+    may_contain_strings |= !strings->empty();
+    operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
+                  zone());
+  }
+
+  DCHECK_EQ(current(), ']');
+  Advance();
+
+  if (is_negated && may_contain_strings) {
+    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+  }
+
+  if (operands->is_empty()) {
+    // Return empty expression if no operands were added (e.g. [\P{Any}]
+    // produces an empty range).
+    DCHECK(ranges->is_empty());
+    DCHECK(strings->empty());
+    return RegExpClassSetExpression::Empty(zone(), is_negated);
+  }
+
+  return zone()->template New<RegExpClassSetExpression>(
+      RegExpClassSetExpression::OperationType::kUnion, is_negated,
+      may_contain_strings, operands);
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassIntersection
+//
+// Parses the rest of a ClassIntersection. |first_operand| was already
+// consumed by the caller and the parser is positioned on the first "&&".
+// Consumes the closing ']' and returns a RegExpClassSetExpression of type
+// kIntersection. The result may contain strings only if every operand may.
+template <class CharT>
+RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
+    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
+    ClassSetOperandType first_operand_type) {
+  DCHECK(unicode_sets());
+  DCHECK(current() == '&' && Next() == '&');
+  bool may_contain_strings =
+      MayContainStrings(first_operand_type, first_operand);
+  ZoneList<RegExpTree*>* operands =
+      zone()->template New<ZoneList<RegExpTree*>>(2, zone());
+  operands->Add(first_operand, zone());
+  while (has_more() && current() != ']') {
+    // Every further operand must be introduced by "&&".
+    if (current() != '&' || Next() != '&') {
+      return ReportError(RegExpError::kInvalidClassSetOperation);
+    }
+    Advance(2);
+    // [lookahead ≠ &]
+    if (current() == '&') {
+      return ReportError(RegExpError::kInvalidCharacterInClass);
+    }
+
+    ClassSetOperandType operand_type;
+    RegExpTree* operand =
+        ParseClassSetOperand(builder, &operand_type CHECK_FAILED);
+    may_contain_strings &= MayContainStrings(operand_type, operand);
+    operands->Add(operand, zone());
+  }
+  if (!has_more()) {
+    return ReportError(RegExpError::kUnterminatedCharacterClass);
+  }
+  if (is_negated && may_contain_strings) {
+    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
+  }
+  DCHECK_EQ(current(), ']');
+  Advance();
+  return zone()->template New<RegExpClassSetExpression>(
+      RegExpClassSetExpression::OperationType::kIntersection, is_negated,
+      may_contain_strings, operands);
+}
+
+// TODO(v8:11935): Change permalink once proposal is in stage 4.
+// https://arai-a.github.io/ecma262-compare/snapshot.html?pr=2418#prod-ClassSubtraction +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction( + const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand, + ClassSetOperandType first_operand_type) { + DCHECK(unicode_sets()); + DCHECK(current() == '-' && Next() == '-'); + const bool may_contain_strings = + MayContainStrings(first_operand_type, first_operand); + if (is_negated && may_contain_strings) { + return ReportError(RegExpError::kNegatedCharacterClassWithStrings); + } + ZoneList<RegExpTree*>* operands = + zone()->template New<ZoneList<RegExpTree*>>(2, zone()); + operands->Add(first_operand, zone()); + while (has_more() && current() != ']') { + if (current() != '-' || Next() != '-') { + return ReportError(RegExpError::kInvalidClassSetOperation); + } + Advance(2); + ClassSetOperandType dummy; // unused + RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED); + operands->Add(operand, zone()); + } + if (!has_more()) { + return ReportError(RegExpError::kUnterminatedCharacterClass); + } + DCHECK_EQ(current(), ']'); + Advance(); + return zone()->template New<RegExpClassSetExpression>( + RegExpClassSetExpression::OperationType::kSubtraction, is_negated, + may_contain_strings, operands); +} + +// https://tc39.es/ecma262/#prod-CharacterClass +template <class CharT> +RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass( + const RegExpBuilder* builder) { + DCHECK_EQ(current(), '['); + Advance(); + bool is_negated = false; + if (current() == '^') { + is_negated = true; + Advance(); + } + ZoneList<CharacterRange>* ranges = + zone()->template New<ZoneList<CharacterRange>>(2, zone()); + if (current() == ']') { + Advance(); + if (unicode_sets()) { + return RegExpClassSetExpression::Empty(zone(), is_negated); + } else { + RegExpClassRanges::ClassRangesFlags class_ranges_flags; + if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED; + return 
zone()->template New<RegExpClassRanges>(zone(), ranges, + class_ranges_flags); + } + } + + if (!unicode_sets()) { + bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case(); + ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED); + if (!has_more()) { + return ReportError(RegExpError::kUnterminatedCharacterClass); + } + DCHECK_EQ(current(), ']'); + Advance(); + RegExpClassRanges::ClassRangesFlags character_class_flags; + if (is_negated) character_class_flags = RegExpClassRanges::NEGATED; + return zone()->template New<RegExpClassRanges>(zone(), ranges, + character_class_flags); + } else { + ClassSetOperandType operand_type; + CharacterClassStrings* strings = + zone()->template New<CharacterClassStrings>(zone()); + RegExpTree* operand = ParseClassSetOperand(builder, &operand_type, ranges, + strings CHECK_FAILED); + switch (current()) { + case '-': + if (Next() == '-') { + if (operand == nullptr) { + operand = + zone()->template New<RegExpClassSetOperand>(ranges, strings); + } + return ParseClassSubtraction(builder, is_negated, operand, + operand_type); + } + // ClassSetRange is handled in ParseClassUnion(). 
+ break; + case '&': + if (Next() == '&') { + if (operand == nullptr) { + operand = + zone()->template New<RegExpClassSetOperand>(ranges, strings); + } + return ParseClassIntersection(builder, is_negated, operand, + operand_type); + } + } + return ParseClassUnion(builder, is_negated, operand, operand_type, ranges, + strings); + } +} + +#undef CHECK_FAILED + +template <class CharT> +bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) { + DCHECK_NOT_NULL(result); + RegExpTree* tree = ParsePattern(); + + if (failed()) { + DCHECK_NULL(tree); + DCHECK_NE(error_, RegExpError::kNone); + result->error = error_; + result->error_pos = error_pos_; + return false; + } + + DCHECK_NOT_NULL(tree); + DCHECK_EQ(error_, RegExpError::kNone); + if (v8_flags.trace_regexp_parser) { + StdoutStream os; + tree->Print(os, zone()); + os << "\n"; + } + + result->tree = tree; + const int capture_count = captures_started(); + result->simple = tree->IsAtom() && simple() && capture_count == 0; + result->contains_anchor = contains_anchor(); + result->capture_count = capture_count; + result->named_captures = GetNamedCaptures(); + return true; +} + +void RegExpBuilder::FlushText() { text_builder().FlushText(); } + +void RegExpBuilder::AddCharacter(base::uc16 c) { + pending_empty_ = false; + text_builder().AddCharacter(c); +} + +void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) { + pending_empty_ = false; + text_builder().AddUnicodeCharacter(c); +} + +void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { + pending_empty_ = false; + text_builder().AddEscapedUnicodeCharacter(character); +} + +void RegExpBuilder::AddEmpty() { + text_builder().FlushPendingSurrogate(); + pending_empty_ = true; +} + +void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) { + pending_empty_ = false; + text_builder().AddClassRanges(cc); +} + +void RegExpBuilder::AddAtom(RegExpTree* term) { + if (term->IsEmpty()) { + AddEmpty(); + return; + } + pending_empty_ = false; + if 
(term->IsTextElement()) { + text_builder().AddAtom(term); + } else { + FlushText(); + terms_.emplace_back(term); + } +} + +void RegExpBuilder::AddTerm(RegExpTree* term) { + DCHECK(!term->IsEmpty()); + pending_empty_ = false; + if (term->IsTextElement()) { + text_builder().AddTerm(term); + } else { + FlushText(); + terms_.emplace_back(term); + } +} + +void RegExpBuilder::AddAssertion(RegExpTree* assert) { + FlushText(); + pending_empty_ = false; + terms_.emplace_back(assert); +} + +void RegExpBuilder::NewAlternative() { FlushTerms(); } + +void RegExpBuilder::FlushTerms() { + FlushText(); + size_t num_terms = terms_.size(); + RegExpTree* alternative; + if (num_terms == 0) { + alternative = zone()->New<RegExpEmpty>(); + } else if (num_terms == 1) { + alternative = terms_.back(); + } else { + alternative = + zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( + base::VectorOf(terms_.begin(), terms_.size()), zone())); + } + alternatives_.emplace_back(alternative); + terms_.clear(); +} + +RegExpTree* RegExpBuilder::ToRegExp() { + FlushTerms(); + size_t num_alternatives = alternatives_.size(); + if (num_alternatives == 0) return zone()->New<RegExpEmpty>(); + if (num_alternatives == 1) return alternatives_.back(); + return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>( + base::VectorOf(alternatives_.begin(), alternatives_.size()), zone())); +} + +bool RegExpBuilder::AddQuantifierToAtom( + int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { + if (pending_empty_) { + pending_empty_ = false; + return true; + } + RegExpTree* atom = text_builder().PopLastAtom(); + if (atom != nullptr) { + FlushText(); + } else if (terms_.size() > 0) { + atom = terms_.back(); + terms_.pop_back(); + if (atom->IsLookaround()) { + // With /u or /v, lookarounds are not quantifiable. + if (IsUnicodeMode()) return false; + // Lookbehinds are not quantifiable. 
+ if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { + return false; + } + } + if (atom->max_match() == 0) { + // Guaranteed to only match an empty string. + if (min == 0) { + return true; + } + terms_.emplace_back(atom); + return true; + } + } else { + // Only call immediately after adding an atom or character! + UNREACHABLE(); + } + terms_.emplace_back( + zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom)); + return true; +} + +template class RegExpParserImpl<uint8_t>; +template class RegExpParserImpl<base::uc16>; + +} // namespace + +// static +bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, + Handle<String> input, + RegExpFlags flags, + RegExpCompileData* result) { + DisallowGarbageCollection no_gc; + uintptr_t stack_limit = isolate->stack_guard()->real_climit(); + String::FlatContent content = input->GetFlatContent(no_gc); + if (content.IsOneByte()) { + base::Vector<const uint8_t> v = content.ToOneByteVector(); + return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags, + stack_limit, zone, no_gc} + .Parse(result); + } else { + base::Vector<const base::uc16> v = content.ToUC16Vector(); + return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags, + stack_limit, zone, no_gc} + .Parse(result); + } +} + +// static +template <class CharT> +bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, + const CharT* input, int input_length, + RegExpFlags flags, + RegExpCompileData* result, + const DisallowGarbageCollection& no_gc) { + return RegExpParserImpl<CharT>{input, input_length, flags, + stack_limit, zone, no_gc} + .Parse(result); +} + +template bool RegExpParser::VerifyRegExpSyntax<uint8_t>( + Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*, + const DisallowGarbageCollection&); +template bool RegExpParser::VerifyRegExpSyntax<base::uc16>( + Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*, + const DisallowGarbageCollection&); + +} // 
namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-parser.h b/js/src/irregexp/imported/regexp-parser.h new file mode 100644 index 0000000000..1e45d97532 --- /dev/null +++ b/js/src/irregexp/imported/regexp-parser.h @@ -0,0 +1,34 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_PARSER_H_ +#define V8_REGEXP_REGEXP_PARSER_H_ + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class String; +class Zone; + +struct RegExpCompileData; + +class V8_EXPORT_PRIVATE RegExpParser : public AllStatic { + public: + static bool ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, + Handle<String> input, RegExpFlags flags, + RegExpCompileData* result); + + template <class CharT> + static bool VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, + const CharT* input, int input_length, + RegExpFlags flags, RegExpCompileData* result, + const DisallowGarbageCollection& no_gc); +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_PARSER_H_ diff --git a/js/src/irregexp/imported/regexp-stack.cc b/js/src/irregexp/imported/regexp-stack.cc new file mode 100644 index 0000000000..ad0aedc67a --- /dev/null +++ b/js/src/irregexp/imported/regexp-stack.cc @@ -0,0 +1,96 @@ +// Copyright 2009 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "irregexp/imported/regexp-stack.h" + + +namespace v8 { +namespace internal { + +RegExpStackScope::RegExpStackScope(Isolate* isolate) + : regexp_stack_(isolate->regexp_stack()), + old_sp_top_delta_(regexp_stack_->sp_top_delta()) { + DCHECK(regexp_stack_->IsValid()); +} + +RegExpStackScope::~RegExpStackScope() { + CHECK_EQ(old_sp_top_delta_, regexp_stack_->sp_top_delta()); + regexp_stack_->ResetIfEmpty(); +} + +RegExpStack::RegExpStack() : thread_local_(this) {} + +RegExpStack::~RegExpStack() { thread_local_.FreeAndInvalidate(); } + +char* RegExpStack::ArchiveStack(char* to) { + if (!thread_local_.owns_memory_) { + // Force dynamic stacks prior to archiving. Any growth will do. A dynamic + // stack is needed because stack archival & restoration rely on `memory_` + // pointing at a fixed-location backing store, whereas the static stack is + // tied to a RegExpStack instance. + EnsureCapacity(thread_local_.memory_size_ + 1); + DCHECK(thread_local_.owns_memory_); + } + + MemCopy(reinterpret_cast<void*>(to), &thread_local_, kThreadLocalSize); + thread_local_ = ThreadLocal(this); + return to + kThreadLocalSize; +} + + +char* RegExpStack::RestoreStack(char* from) { + MemCopy(&thread_local_, reinterpret_cast<void*>(from), kThreadLocalSize); + return from + kThreadLocalSize; +} + +void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) { + if (owns_memory_) DeleteArray(memory_); + + memory_ = regexp_stack->static_stack_; + memory_top_ = regexp_stack->static_stack_ + kStaticStackSize; + memory_size_ = kStaticStackSize; + stack_pointer_ = memory_top_; + limit_ = reinterpret_cast<Address>(regexp_stack->static_stack_) + + kStackLimitSlack * kSystemPointerSize; + owns_memory_ = false; +} + +void RegExpStack::ThreadLocal::FreeAndInvalidate() { + if (owns_memory_) DeleteArray(memory_); + + // This stack may not be used after being freed. Just reset to invalid values + // to ensure we don't accidentally use old memory areas. 
+ memory_ = nullptr; + memory_top_ = nullptr; + memory_size_ = 0; + stack_pointer_ = nullptr; + limit_ = kMemoryTop; +} + +Address RegExpStack::EnsureCapacity(size_t size) { + if (size > kMaximumStackSize) return kNullAddress; + if (thread_local_.memory_size_ < size) { + if (size < kMinimumDynamicStackSize) size = kMinimumDynamicStackSize; + byte* new_memory = NewArray<byte>(size); + if (thread_local_.memory_size_ > 0) { + // Copy original memory into top of new memory. + MemCopy(new_memory + size - thread_local_.memory_size_, + thread_local_.memory_, thread_local_.memory_size_); + if (thread_local_.owns_memory_) DeleteArray(thread_local_.memory_); + } + ptrdiff_t delta = sp_top_delta(); + thread_local_.memory_ = new_memory; + thread_local_.memory_top_ = new_memory + size; + thread_local_.memory_size_ = size; + thread_local_.stack_pointer_ = thread_local_.memory_top_ + delta; + thread_local_.limit_ = reinterpret_cast<Address>(new_memory) + + kStackLimitSlack * kSystemPointerSize; + thread_local_.owns_memory_ = true; + } + return reinterpret_cast<Address>(thread_local_.memory_top_); +} + + +} // namespace internal +} // namespace v8 diff --git a/js/src/irregexp/imported/regexp-stack.h b/js/src/irregexp/imported/regexp-stack.h new file mode 100644 index 0000000000..f03898bb00 --- /dev/null +++ b/js/src/irregexp/imported/regexp-stack.h @@ -0,0 +1,159 @@ +// Copyright 2009 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_STACK_H_ +#define V8_REGEXP_REGEXP_STACK_H_ + +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class RegExpStack; + +// Maintains a per-v8thread stack area that can be used by irregexp +// implementation for its backtracking stack. +class V8_NODISCARD RegExpStackScope final { + public: + // Create and delete an instance to control the life-time of a growing stack. 
+ + // Initializes the stack memory area if necessary. + explicit RegExpStackScope(Isolate* isolate); + ~RegExpStackScope(); // Releases the stack if it has grown. + RegExpStackScope(const RegExpStackScope&) = delete; + RegExpStackScope& operator=(const RegExpStackScope&) = delete; + + RegExpStack* stack() const { return regexp_stack_; } + + private: + RegExpStack* const regexp_stack_; + const ptrdiff_t old_sp_top_delta_; +}; + +class RegExpStack final { + public: + RegExpStack(); + ~RegExpStack(); + RegExpStack(const RegExpStack&) = delete; + RegExpStack& operator=(const RegExpStack&) = delete; + + // Number of allocated locations on the stack below the limit. No sequence of + // pushes must be longer than this without doing a stack-limit check. + static constexpr int kStackLimitSlack = 32; + + Address memory_top() const { + DCHECK_NE(0, thread_local_.memory_size_); + DCHECK_EQ(thread_local_.memory_top_, + thread_local_.memory_ + thread_local_.memory_size_); + return reinterpret_cast<Address>(thread_local_.memory_top_); + } + + Address stack_pointer() const { + return reinterpret_cast<Address>(thread_local_.stack_pointer_); + } + + size_t memory_size() const { return thread_local_.memory_size_; } + + // If the stack pointer gets below the limit, we should react and + // either grow the stack or report an out-of-stack exception. + // There is only a limited number of locations below the stack limit, + // so users of the stack should check the stack limit during any + // sequence of pushes longer than this. + Address* limit_address_address() { return &thread_local_.limit_; } + + // Ensures that there is a memory area with at least the specified size. + // No allocation happens if the current area is already large enough. + Address EnsureCapacity(size_t size); + + // Thread local archiving. 
+ static constexpr int ArchiveSpacePerThread() { + return static_cast<int>(kThreadLocalSize); + } + char* ArchiveStack(char* to); + char* RestoreStack(char* from); + void FreeThreadResources() { thread_local_.ResetToStaticStack(this); } + + // Maximal size of allocated stack area. + static constexpr size_t kMaximumStackSize = 64 * MB; + + private: + // Artificial limit used when the thread-local state has been destroyed. + static const Address kMemoryTop = + static_cast<Address>(static_cast<uintptr_t>(-1)); + + // Minimal size of dynamically-allocated stack area. + static constexpr size_t kMinimumDynamicStackSize = 1 * KB; + + // In addition to dynamically-allocated, variable-sized stacks, we also have + // a statically allocated and sized area that is used whenever no dynamic + // stack is allocated. This guarantees that a stack is always available and + // we can skip availability-checks later on. + // It's double the slack size to ensure that we have a bit of breathing room + // before NativeRegExpMacroAssembler::GrowStack must be called. + static constexpr size_t kStaticStackSize = + 2 * kStackLimitSlack * kSystemPointerSize; + byte static_stack_[kStaticStackSize] = {0}; + + static_assert(kStaticStackSize <= kMaximumStackSize); + + // Structure holding the allocated memory, size and limit. Thread switching + // archives and restores this struct. + struct ThreadLocal { + explicit ThreadLocal(RegExpStack* regexp_stack) { + ResetToStaticStack(regexp_stack); + } + + // If memory_size_ > 0 then + // - memory_, memory_top_, stack_pointer_ must be non-nullptr + // - memory_top_ = memory_ + memory_size_ + // - memory_ <= stack_pointer_ <= memory_top_ + byte* memory_ = nullptr; + byte* memory_top_ = nullptr; + size_t memory_size_ = 0; + byte* stack_pointer_ = nullptr; + Address limit_ = kNullAddress; + bool owns_memory_ = false; // Whether memory_ is owned and must be freed. 
+ + void ResetToStaticStack(RegExpStack* regexp_stack); + void ResetToStaticStackIfEmpty(RegExpStack* regexp_stack) { + if (stack_pointer_ == memory_top_) ResetToStaticStack(regexp_stack); + } + void FreeAndInvalidate(); + }; + static constexpr size_t kThreadLocalSize = sizeof(ThreadLocal); + + Address memory_top_address_address() { + return reinterpret_cast<Address>(&thread_local_.memory_top_); + } + + Address stack_pointer_address() { + return reinterpret_cast<Address>(&thread_local_.stack_pointer_); + } + + // A position-independent representation of the stack pointer. + ptrdiff_t sp_top_delta() const { + ptrdiff_t result = + reinterpret_cast<intptr_t>(thread_local_.stack_pointer_) - + reinterpret_cast<intptr_t>(thread_local_.memory_top_); + DCHECK_LE(result, 0); + return result; + } + + // Resets the buffer if it has grown beyond the default/minimum size and is + // empty. + void ResetIfEmpty() { thread_local_.ResetToStaticStackIfEmpty(this); } + + // Whether the ThreadLocal storage has been invalidated. + bool IsValid() const { return thread_local_.memory_ != nullptr; } + + ThreadLocal thread_local_; + + friend class ExternalReference; + friend class RegExpStackScope; +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_STACK_H_ diff --git a/js/src/irregexp/imported/regexp.h b/js/src/irregexp/imported/regexp.h new file mode 100644 index 0000000000..50269a4b71 --- /dev/null +++ b/js/src/irregexp/imported/regexp.h @@ -0,0 +1,236 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef V8_REGEXP_REGEXP_H_ +#define V8_REGEXP_REGEXP_H_ + +#include "irregexp/imported/regexp-error.h" +#include "irregexp/RegExpShim.h" + +namespace v8 { +namespace internal { + +class JSRegExp; +class RegExpCapture; +class RegExpMatchInfo; +class RegExpNode; +class RegExpTree; + +enum class RegExpCompilationTarget : int { kBytecode, kNative }; + +// TODO(jgruber): Do not expose in regexp.h. +// TODO(jgruber): Consider splitting between ParseData and CompileData. +struct RegExpCompileData { + // The parsed AST as produced by the RegExpParser. + RegExpTree* tree = nullptr; + + // The compiled Node graph as produced by RegExpTree::ToNode methods. + RegExpNode* node = nullptr; + + // Either the generated code as produced by the compiler or a trampoline + // to the interpreter. + Handle<Object> code; + + // True, iff the pattern is a 'simple' atom with zero captures. In other + // words, the pattern consists of a string with no metacharacters and special + // regexp features, and can be implemented as a standard string search. + bool simple = true; + + // True, iff the pattern is anchored at the start of the string with '^'. + bool contains_anchor = false; + + // Only set if the pattern contains named captures. + // Note: the lifetime equals that of the parse/compile zone. + ZoneVector<RegExpCapture*>* named_captures = nullptr; + + // The error message. Only used if an error occurred during parsing or + // compilation. + RegExpError error = RegExpError::kNone; + + // The position at which the error was detected. Only used if an + // error occurred. + int error_pos = 0; + + // The number of capture groups, without the global capture \0. + int capture_count = 0; + + // The number of registers used by the generated code. + int register_count = 0; + + // Target (bytecode or native code). NOTE(review): no default initializer. + RegExpCompilationTarget compilation_target; +}; + +class RegExp final : public AllStatic { + public: + // Whether the irregexp engine generates interpreter bytecode. 
+ static bool CanGenerateBytecode(); + + // Verify that the given flags combination is valid. + V8_EXPORT_PRIVATE static bool VerifyFlags(RegExpFlags flags); + + // Verify the given pattern, i.e. check that parsing succeeds. If + // verification fails, `regexp_error_out` is set. + template <class CharT> + static bool VerifySyntax(Zone* zone, uintptr_t stack_limit, + const CharT* input, int input_length, + RegExpFlags flags, RegExpError* regexp_error_out, + const DisallowGarbageCollection& no_gc); + + // Parses the RegExp pattern and prepares the JSRegExp object with + // generic data and choice of implementation - as well as what + // the implementation wants to store in the data field. + // Returns false if compilation fails. + V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile( + Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, + RegExpFlags flags, uint32_t backtrack_limit); + + // Ensures that a regexp is fully compiled and ready to be executed on a + // subject string. Returns true on success. Return false on failure, and + // then an exception will be pending. + V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate, + Handle<JSRegExp> re, + Handle<String> subject); + + enum CallOrigin : int { + kFromRuntime = 0, + kFromJs = 1, + }; + + enum class ExecQuirks { + kNone, + // Used to work around an issue in the RegExpPrototypeSplit fast path, + // which diverges from the spec by not creating a sticky copy of the RegExp + // instance and calling `exec` in a loop. If called in this context, we + // must not update the last_match_info on a successful match at the subject + // string end. See crbug.com/1075514 for more information. + kTreatMatchAtEndAsFailure, + }; + + // See ECMA-262 section 15.10.6.2. + // This function calls the garbage collector if necessary. 
+ V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec( + Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, + int index, Handle<RegExpMatchInfo> last_match_info, + ExecQuirks exec_quirks = ExecQuirks::kNone); + + V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> + ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp, + Handle<String> subject, int index, + Handle<RegExpMatchInfo> last_match_info, + ExecQuirks exec_quirks = ExecQuirks::kNone); + + // Integral return values used throughout regexp code layers. + static constexpr int kInternalRegExpFailure = 0; + static constexpr int kInternalRegExpSuccess = 1; + static constexpr int kInternalRegExpException = -1; + static constexpr int kInternalRegExpRetry = -2; + static constexpr int kInternalRegExpFallbackToExperimental = -3; + static constexpr int kInternalRegExpSmallestResult = -3; + + enum IrregexpResult : int32_t { + RE_FAILURE = kInternalRegExpFailure, + RE_SUCCESS = kInternalRegExpSuccess, + RE_EXCEPTION = kInternalRegExpException, + RE_RETRY = kInternalRegExpRetry, + RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental, + }; + + // Set last match info. If match is nullptr, then setting captures is + // omitted. 
+ static Handle<RegExpMatchInfo> SetLastMatchInfo( + Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, + Handle<String> subject, int capture_count, int32_t* match); + + V8_EXPORT_PRIVATE static bool CompileForTesting( + Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags, + Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte); + + V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label, + RegExpNode* node); + + static const int kRegExpTooLargeToOptimize = 20 * KB; + + V8_WARN_UNUSED_RESULT + static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate, + Handle<JSRegExp> re, + RegExpFlags flags, + Handle<String> pattern, + RegExpError error); + static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, + RegExpError error_text); + + static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp); + + static Handle<FixedArray> CreateCaptureNameMap( + Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures); +}; + +// Uses a special global mode of irregexp-generated code to perform a global +// search and return multiple results at once. As such, this is essentially an +// iterator over multiple results (retrieved batch-wise in advance). +class RegExpGlobalCache final { + public: + RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject, + Isolate* isolate); + + ~RegExpGlobalCache(); + + // Fetch the next entry in the cache for global regexp match results. + // This does not set the last match info. Upon failure, nullptr is + // returned; HasException() tells whether an exception occurred. The + // previous result is still available in memory when a failure happens. + int32_t* FetchNext(); + + int32_t* LastSuccessfulMatch(); + + bool HasException() { return num_matches_ < 0; } + + private: + int AdvanceZeroLength(int last_index); + + int num_matches_; + int max_matches_; + int current_match_index_; + int registers_per_match_; + // Pointer to the last set of captures. 
+ int32_t* register_array_; + int register_array_size_; + Handle<JSRegExp> regexp_; + Handle<String> subject_; + Isolate* isolate_; +}; + +// Caches results for specific regexp queries on the isolate. At the time of +// writing, this is used during global calls to RegExp.prototype.exec and +// @@split. +class RegExpResultsCache final : public AllStatic { + public: + enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS }; + + // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi. + // On success, the returned result is guaranteed to be a COW-array. + static Object Lookup(Heap* heap, String key_string, Object key_pattern, + FixedArray* last_match_out, ResultsCacheType type); + // Attempt to add value_array to the cache specified by type. On success, + // value_array is turned into a COW-array. + static void Enter(Isolate* isolate, Handle<String> key_string, + Handle<Object> key_pattern, Handle<FixedArray> value_array, + Handle<FixedArray> last_match_cache, ResultsCacheType type); + static void Clear(FixedArray cache); + + static constexpr int kRegExpResultsCacheSize = 0x100; + + private: + static constexpr int kStringOffset = 0; + static constexpr int kPatternOffset = 1; + static constexpr int kArrayOffset = 2; + static constexpr int kLastMatchOffset = 3; + static constexpr int kArrayEntriesPerCacheEntry = 4; +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_H_ diff --git a/js/src/irregexp/imported/special-case.cc b/js/src/irregexp/imported/special-case.cc new file mode 100644 index 0000000000..f5a9928b3a --- /dev/null +++ b/js/src/irregexp/imported/special-case.cc @@ -0,0 +1,111 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that +// can be found in the LICENSE file. 
+ +// Automatically generated by regexp/gen-regexp-special-case.cc + +// The following functions are used to build UnicodeSets +// for special cases where the case-folding algorithm used by +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match +// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime +// Semantics: Canonicalize) step 3. + +#ifdef V8_INTL_SUPPORT +#include "irregexp/imported/special-case.h" + +#include "unicode/uniset.h" +namespace v8 { +namespace internal { + +icu::UnicodeSet BuildIgnoreSet() { + icu::UnicodeSet set; + set.add(0xdf); + set.add(0x17f); + set.add(0x390); + set.add(0x3b0); + set.add(0x3f4); + set.add(0x1e9e); + set.add(0x1f80, 0x1faf); + set.add(0x1fb3); + set.add(0x1fbc); + set.add(0x1fc3); + set.add(0x1fcc); + set.add(0x1fd3); + set.add(0x1fe3); + set.add(0x1ff3); + set.add(0x1ffc); + set.add(0x2126); + set.add(0x212a, 0x212b); + set.add(0xfb05, 0xfb06); + set.freeze(); + return set; +} + +struct IgnoreSetData { + IgnoreSetData() : set(BuildIgnoreSet()) {} + const icu::UnicodeSet set; +}; + +//static +const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() { + static base::LazyInstance<IgnoreSetData>::type set = + LAZY_INSTANCE_INITIALIZER; + return set.Pointer()->set; +} + +icu::UnicodeSet BuildSpecialAddSet() { + icu::UnicodeSet set; + set.add(0x4b); + set.add(0x53); + set.add(0x6b); + set.add(0x73); + set.add(0xc5); + set.add(0xe5); + set.add(0x398); + set.add(0x3a9); + set.add(0x3b8); + set.add(0x3c9); + set.add(0x3d1); + set.freeze(); + return set; +} + +struct SpecialAddSetData { + SpecialAddSetData() : set(BuildSpecialAddSet()) {} + const icu::UnicodeSet set; +}; + +//static +const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() { + static base::LazyInstance<SpecialAddSetData>::type set = + LAZY_INSTANCE_INITIALIZER; + return set.Pointer()->set; +} + +icu::UnicodeSet BuildUnicodeNonSimpleCloseOverSet() { + icu::UnicodeSet set; + set.add(0x390); + set.add(0x3b0); + set.add(0x1fd3); + set.add(0x1fe3); + 
set.add(0xfb05, 0xfb06); + set.freeze(); + return set; +} + +struct UnicodeNonSimpleCloseOverSetData { + UnicodeNonSimpleCloseOverSetData() : set(BuildUnicodeNonSimpleCloseOverSet()) {} + const icu::UnicodeSet set; +}; + +//static +const icu::UnicodeSet& RegExpCaseFolding::UnicodeNonSimpleCloseOverSet() { + static base::LazyInstance<UnicodeNonSimpleCloseOverSetData>::type set = + LAZY_INSTANCE_INITIALIZER; + return set.Pointer()->set; +} + + +} // namespace internal +} // namespace v8 +#endif // V8_INTL_SUPPORT diff --git a/js/src/irregexp/imported/special-case.h b/js/src/irregexp/imported/special-case.h new file mode 100644 index 0000000000..ea511af5a4 --- /dev/null +++ b/js/src/irregexp/imported/special-case.h @@ -0,0 +1,127 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_SPECIAL_CASE_H_ +#define V8_REGEXP_SPECIAL_CASE_H_ + +#ifdef V8_INTL_SUPPORT +#include "irregexp/RegExpShim.h" + +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" + +namespace v8 { +namespace internal { + +// Sets of Unicode characters that need special handling under "i" mode + +// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262 +// defines slightly different case-folding rules than Unicode. An +// input character should match a pattern character if the result of +// the Canonicalize algorithm is the same for both characters. +// +// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as +// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character +// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See +// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for +// the precise definition. +// +// While compiling such regular expressions, we need to compute the +// set of characters that should match a given input character. 
(See +// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.) +// For almost all characters, this can be efficiently computed using +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent +// the remaining special cases. +// +// For a character c, the rules are as follows: +// +// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet +// containing c will produce the set of characters that should +// match /c/i (or /[c]/i), and only those characters. +// +// 2. If c is in IgnoreSet, then the only character it should match is +// itself. However, closeOver will add additional incorrect +// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ' +// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is +// "SS". Step 3.e therefore requires that 'ß' canonicalizes to +// itself, and should not match 'ẞ'. In these cases, we can skip +// the closeOver entirely, because it will never add an equivalent +// character. +// +// 3. If c is in SpecialAddSet, then it should match at least one +// character other than itself. However, closeOver will add at +// least one additional incorrect match. For example, consider the +// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase +// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN +// SIGN should not match either of the other two characters. As a +// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in +// IgnoreSet). To find the correct matches for characters in +// SpecialAddSet, we closeOver the original character, but filter +// out the results that do not have the same canonical value. +// +// The contents of these sets are calculated at build time by +// src/regexp/gen-regexp-special-case.cc, which generates +// gen/src/regexp/special-case.cc. 
This is done by iterating over the +// result of closeOver for each BMP character, and finding sets for +// which at least one character has a different canonical value than +// another character. Characters that match no other characters in +// their equivalence class are added to IgnoreSet. Characters that +// match at least one other character are added to SpecialAddSet. +// +// For unicode ignoreCase ("iu" and "iv"), +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in +// the same equivalence class. This includes characters that are in the same +// equivalence class using full case folding. According to the spec, only +// simple case folding shall be considered. We therefore create +// UnicodeNonSimpleCloseOverSet containing all characters for which +// UnicodeSet::closeOver adds characters that are not simple case folds. This +// set should be used similarly to IgnoreSet described above. + +class RegExpCaseFolding final : public AllStatic { + public: + static const icu::UnicodeSet& IgnoreSet(); + static const icu::UnicodeSet& SpecialAddSet(); + static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet(); + + // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: + // Canonicalize) step 3, which is used to determine whether + // characters match when ignoreCase is true and unicode is false. + static UChar32 Canonicalize(UChar32 ch) { + // a. Assert: ch is a UTF-16 code unit. + CHECK_LE(ch, 0xffff); + + // b. Let s be the String value consisting of the single code unit ch. + icu::UnicodeString s(ch); + + // c. Let u be the same result produced as if by performing the algorithm + // for String.prototype.toUpperCase using s as the this value. + // d. Assert: Type(u) is String. + icu::UnicodeString& u = s.toUpper(); + + // e. If u does not consist of a single code unit, return ch. + if (u.length() != 1) { + return ch; + } + + // f. Let cu be u's single code unit element. + UChar32 cu = u.char32At(0); + + // g. 
If the value of ch >= 128 and the value of cu < 128, return ch. + if (ch >= 128 && cu < 128) { + return ch; + } + + // h. Return cu. + return cu; + } +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_INTL_SUPPORT + +#endif // V8_REGEXP_SPECIAL_CASE_H_ diff --git a/js/src/irregexp/moz.build b/js/src/irregexp/moz.build new file mode 100644 index 0000000000..ff030ad4bd --- /dev/null +++ b/js/src/irregexp/moz.build @@ -0,0 +1,49 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +FINAL_LIBRARY = "js" + +# Includes should be relative to parent path +LOCAL_INCLUDES += ["!..", ".."] + +include("../js-config.mozbuild") +include("../js-cxxflags.mozbuild") + +CXXFLAGS += ["-Wno-error=type-limits", "-Wno-error=return-type"] + +# Suppress spurious warnings in third-party code. See bug 1810584. +if CONFIG["CC_TYPE"] == "gcc": + CXXFLAGS += ["-Wno-error=nonnull"] + +UNIFIED_SOURCES += [ + "imported/regexp-bytecode-generator.cc", + "imported/regexp-bytecode-peephole.cc", + "imported/regexp-bytecodes.cc", + "imported/regexp-compiler-tonode.cc", + "imported/regexp-dotprinter.cc", + "imported/regexp-interpreter.cc", + "imported/regexp-macro-assembler-tracer.cc", + "imported/regexp-macro-assembler.cc", + "imported/regexp-parser.cc", + "imported/regexp-stack.cc", + "RegExpAPI.cpp", + "RegExpShim.cpp", + "util/UnicodeShim.cpp", +] +SOURCES += [ + "imported/regexp-ast.cc", + "imported/regexp-compiler.cc", # Bug 1643693 + "RegExpNativeMacroAssembler.cpp", +] + +if CONFIG["JS_HAS_INTL_API"]: + CXXFLAGS += ["-DV8_INTL_SUPPORT"] + UNIFIED_SOURCES += ["imported/property-sequences.cc", "imported/special-case.cc"] + +# Make sure all irregexp code is built with libfuzzer +# coverage instrumentation in FUZZING mode. 
+if CONFIG["FUZZING_INTERFACES"] and CONFIG["LIBFUZZER"]: + include("/tools/fuzzing/libfuzzer-config.mozbuild") diff --git a/js/src/irregexp/moz.yaml b/js/src/irregexp/moz.yaml new file mode 100644 index 0000000000..b42066e148 --- /dev/null +++ b/js/src/irregexp/moz.yaml @@ -0,0 +1,36 @@ +schema: 1 + +bugzilla: + product: Core + component: "JavaScript Engine" + +origin: + name: irregexp + description: A fast regular expression engine from V8 + url: https://v8.dev + + release: d94dfc2b01f988566aa410ce871588cf23b1285d (Thu May 04 20:02:50 2023). + revision: d94dfc2b01f988566aa410ce871588cf23b1285d + + license: BSD-3-Clause + license-file: LICENSE.v8 + +vendoring: + url: https://chromium.googlesource.com/v8/v8.git + source-hosting: googlesource + vendor-directory: js/src/irregexp/ + skip-vendoring-steps: ['fetch', 'move-contents'] + + update-actions: + - action: run-script + script: 'import-irregexp.py' + cwd: '{yaml_dir}' + +updatebot: + maintainer-phab: iain + maintainer-bz: iireland@mozilla.com + try-preset: sm-shell + tasks: + - type: vendoring + enabled: True + frequency: 1 week diff --git a/js/src/irregexp/util/FlagsShim.h b/js/src/irregexp/util/FlagsShim.h new file mode 100644 index 0000000000..28211a42bf --- /dev/null +++ b/js/src/irregexp/util/FlagsShim.h @@ -0,0 +1,93 @@ +// Copyright 2014 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_UTIL_FLAGS_H_ +#define V8_UTIL_FLAGS_H_ + +// Origin: +// https://github.com/v8/v8/blob/1bafcc6b999b23ea1d394f5d267a08183e3c4e19/src/base/flags.h#L15-L90 + +namespace v8 { +namespace base { + +// The Flags class provides a type-safe way of storing OR-combinations of enum +// values. The Flags<T, S> class is a template class, where T is an enum type, +// and S is the underlying storage type (usually int). 
+// +// The traditional C++ approach for storing OR-combinations of enum values is to +// use an int or unsigned int variable. The inconvenience with this approach is +// that there's no type checking at all; any enum value can be OR'd with any +// other enum value and passed on to a function that takes an int or unsigned +// int. +template <typename T, typename S = int> +class Flags final { + public: + using flag_type = T; + using mask_type = S; + + constexpr Flags() : mask_(0) {} + constexpr Flags(flag_type flag) : mask_(static_cast<S>(flag)) {} + constexpr explicit Flags(mask_type mask) : mask_(static_cast<S>(mask)) {} + + constexpr bool operator==(flag_type flag) const { + return mask_ == static_cast<S>(flag); + } + constexpr bool operator!=(flag_type flag) const { + return mask_ != static_cast<S>(flag); + } + + Flags& operator&=(const Flags& flags) { + mask_ &= flags.mask_; + return *this; + } + Flags& operator|=(const Flags& flags) { + mask_ |= flags.mask_; + return *this; + } + Flags& operator^=(const Flags& flags) { + mask_ ^= flags.mask_; + return *this; + } + + constexpr Flags operator&(const Flags& flags) const { + return Flags(mask_ & flags.mask_); + } + constexpr Flags operator|(const Flags& flags) const { + return Flags(mask_ | flags.mask_); + } + constexpr Flags operator^(const Flags& flags) const { + return Flags(mask_ ^ flags.mask_); + } + + Flags& operator&=(flag_type flag) { return operator&=(Flags(flag)); } + Flags& operator|=(flag_type flag) { return operator|=(Flags(flag)); } + Flags& operator^=(flag_type flag) { return operator^=(Flags(flag)); } + + constexpr Flags operator&(flag_type flag) const { + return operator&(Flags(flag)); + } + constexpr Flags operator|(flag_type flag) const { + return operator|(Flags(flag)); + } + constexpr Flags operator^(flag_type flag) const { + return operator^(Flags(flag)); + } + + constexpr Flags operator~() const { return Flags(~mask_); } + + constexpr operator mask_type() const { return mask_; } + constexpr 
bool operator!() const { return !mask_; } + + Flags without(flag_type flag) { return *this & (~Flags(flag)); } + + friend size_t hash_value(const Flags& flags) { return flags.mask_; } + + private: + mask_type mask_; +}; + +} // namespace base +} // namespace v8 + +#endif // V8_UTIL_FLAGS_H_ diff --git a/js/src/irregexp/util/UnicodeShim.cpp b/js/src/irregexp/util/UnicodeShim.cpp new file mode 100644 index 0000000000..e2784e9e44 --- /dev/null +++ b/js/src/irregexp/util/UnicodeShim.cpp @@ -0,0 +1,1866 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// This file is a subset of: +// https://github.com/v8/v8/blob/master/src/strings/unicode.cc + +#include "irregexp/RegExpShim.h" + +#ifdef V8_INTL_SUPPORT +# include "unicode/uchar.h" +#endif + +namespace v8 { +namespace unibrow { + +#ifndef V8_INTL_SUPPORT +static const int kStartBit = (1 << 30); +static const int kChunkBits = (1 << 13); +#endif // !V8_INTL_SUPPORT + +static const uchar kSentinel = static_cast<uchar>(-1); + +/** + * \file + * Implementations of functions for working with Unicode. + */ + +using int16_t = signed short; // NOLINT +using uint16_t = unsigned short; // NOLINT +using int32_t = int; // NOLINT + +#ifndef V8_INTL_SUPPORT +// All access to the character table should go through this function. +template <int D> +static inline uchar TableGet(const int32_t* table, int index) { + return table[D * index]; +} + +static inline uchar GetEntry(int32_t entry) { return entry & (kStartBit - 1); } + +static inline bool IsStart(int32_t entry) { return (entry & kStartBit) != 0; } + +/** + * Look up a character in the Unicode table using a mix of binary and + * interpolation search. For a uniformly distributed array + * interpolation search beats binary search by a wide margin. 
However, + * in this case interpolation search degenerates because of some very + * high values in the lower end of the table so this function uses a + * combination. The average number of steps to look up the information + * about a character is around 10, slightly higher if there is no + * information available about the character. + */ +static bool LookupPredicate(const int32_t* table, uint16_t size, uchar chr) { + static const int kEntryDist = 1; + uint16_t value = chr & (kChunkBits - 1); + unsigned int low = 0; + unsigned int high = size - 1; + while (high != low) { + unsigned int mid = low + ((high - low) >> 1); + uchar current_value = GetEntry(TableGet<kEntryDist>(table, mid)); + // If we've found an entry less than or equal to this one, and the + // next one is not also less than this one, we've arrived. + if ((current_value <= value) && + (mid + 1 == size || + GetEntry(TableGet<kEntryDist>(table, mid + 1)) > value)) { + low = mid; + break; + } else if (current_value < value) { + low = mid + 1; + } else if (current_value > value) { + // If we've just checked the bottom-most value and it's not + // the one we're looking for, we're done. + if (mid == 0) break; + high = mid - 1; + } + } + int32_t field = TableGet<kEntryDist>(table, low); + uchar entry = GetEntry(field); + bool is_start = IsStart(field); + return (entry == value) || (entry < value && is_start); +} +#endif // !V8_INTL_SUPPORT + +template <int kW> +struct MultiCharacterSpecialCase { + static const uchar kEndOfEncoding = kSentinel; + uchar chars[kW]; +}; + +#ifndef V8_INTL_SUPPORT +// Look up the mapping for the given character in the specified table, +// which is of the specified length and uses the specified special case +// mapping for multi-char mappings. The next parameter is the character +// following the one to map. The result will be written in to the result +// buffer and the number of characters written will be returned. 
Finally, +// if the allow_caching_ptr is non-null then false will be stored in +// it if the result contains multiple characters or depends on the +// context. +// If ranges are linear, a match between a start and end point is +// offset by the distance between the match and the start. Otherwise +// the result is the same as for the start point on the entire range. +template <bool ranges_are_linear, int kW> +static int LookupMapping(const int32_t* table, uint16_t size, + const MultiCharacterSpecialCase<kW>* multi_chars, + uchar chr, uchar next, uchar* result, + bool* allow_caching_ptr) { + static const int kEntryDist = 2; + uint16_t key = chr & (kChunkBits - 1); + uint16_t chunk_start = chr - key; + unsigned int low = 0; + unsigned int high = size - 1; + while (high != low) { + unsigned int mid = low + ((high - low) >> 1); + uchar current_value = GetEntry(TableGet<kEntryDist>(table, mid)); + // If we've found an entry less than or equal to this one, and the next one + // is not also less than this one, we've arrived. + if ((current_value <= key) && + (mid + 1 == size || + GetEntry(TableGet<kEntryDist>(table, mid + 1)) > key)) { + low = mid; + break; + } else if (current_value < key) { + low = mid + 1; + } else if (current_value > key) { + // If we've just checked the bottom-most value and it's not + // the one we're looking for, we're done. + if (mid == 0) break; + high = mid - 1; + } + } + int32_t field = TableGet<kEntryDist>(table, low); + uchar entry = GetEntry(field); + bool is_start = IsStart(field); + bool found = (entry == key) || (entry < key && is_start); + if (found) { + int32_t value = table[2 * low + 1]; + if (value == 0) { + // 0 means not present + return 0; + } else if ((value & 3) == 0) { + // Low bits 0 means a constant offset from the given character. 
+ if (ranges_are_linear) { + result[0] = chr + (value >> 2); + } else { + result[0] = entry + chunk_start + (value >> 2); + } + return 1; + } else if ((value & 3) == 1) { + // Low bits 1 means a special case mapping + if (allow_caching_ptr) *allow_caching_ptr = false; + const MultiCharacterSpecialCase<kW>& mapping = multi_chars[value >> 2]; + int length = 0; + for (length = 0; length < kW; length++) { + uchar mapped = mapping.chars[length]; + if (mapped == MultiCharacterSpecialCase<kW>::kEndOfEncoding) break; + if (ranges_are_linear) { + result[length] = mapped + (key - entry); + } else { + result[length] = mapped; + } + } + return length; + } else { + // Low bits 2 means a really really special case + if (allow_caching_ptr) *allow_caching_ptr = false; + // The cases of this switch are defined in unicode.py in the + // really_special_cases mapping. + switch (value >> 2) { + case 1: + // Really special case 1: upper case sigma. This letter + // converts to two different lower case sigmas depending on + // whether or not it occurs at the end of a word. 
+ if (next != 0 && Letter::Is(next)) { + result[0] = 0x03C3; + } else { + result[0] = 0x03C2; + } + return 1; + default: + return 0; + } + return -1; + } + } else { + return 0; + } +} +#endif // !V8_INTL_SUPPORT + +// Letter: point.category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'] +#ifdef V8_INTL_SUPPORT +bool Letter::Is(uchar c) { return static_cast<bool>(u_isalpha(c)); } +#else +static const uint16_t kLetterTable0Size = 431; +static const int32_t kLetterTable0[431] = { + 1073741889, 90, 1073741921, 122, + 170, 181, 186, 1073742016, // NOLINT + 214, 1073742040, 246, 1073742072, + 705, 1073742534, 721, 1073742560, // NOLINT + 740, 748, 750, 1073742704, + 884, 1073742710, 887, 1073742714, // NOLINT + 893, 895, 902, 1073742728, + 906, 908, 1073742734, 929, // NOLINT + 1073742755, 1013, 1073742839, 1153, + 1073742986, 1327, 1073743153, 1366, // NOLINT + 1369, 1073743201, 1415, 1073743312, + 1514, 1073743344, 1522, 1073743392, // NOLINT + 1610, 1073743470, 1647, 1073743473, + 1747, 1749, 1073743589, 1766, // NOLINT + 1073743598, 1775, 1073743610, 1788, + 1791, 1808, 1073743634, 1839, // NOLINT + 1073743693, 1957, 1969, 1073743818, + 2026, 1073743860, 2037, 2042, // NOLINT + 1073743872, 2069, 2074, 2084, + 2088, 1073743936, 2136, 1073744032, // NOLINT + 2226, 1073744132, 2361, 2365, + 2384, 1073744216, 2401, 1073744241, // NOLINT + 2432, 1073744261, 2444, 1073744271, + 2448, 1073744275, 2472, 1073744298, // NOLINT + 2480, 2482, 1073744310, 2489, + 2493, 2510, 1073744348, 2525, // NOLINT + 1073744351, 2529, 1073744368, 2545, + 1073744389, 2570, 1073744399, 2576, // NOLINT + 1073744403, 2600, 1073744426, 2608, + 1073744434, 2611, 1073744437, 2614, // NOLINT + 1073744440, 2617, 1073744473, 2652, + 2654, 1073744498, 2676, 1073744517, // NOLINT + 2701, 1073744527, 2705, 1073744531, + 2728, 1073744554, 2736, 1073744562, // NOLINT + 2739, 1073744565, 2745, 2749, + 2768, 1073744608, 2785, 1073744645, // NOLINT + 2828, 1073744655, 2832, 1073744659, + 2856, 1073744682, 2864, 
1073744690, // NOLINT + 2867, 1073744693, 2873, 2877, + 1073744732, 2909, 1073744735, 2913, // NOLINT + 2929, 2947, 1073744773, 2954, + 1073744782, 2960, 1073744786, 2965, // NOLINT + 1073744793, 2970, 2972, 1073744798, + 2975, 1073744803, 2980, 1073744808, // NOLINT + 2986, 1073744814, 3001, 3024, + 1073744901, 3084, 1073744910, 3088, // NOLINT + 1073744914, 3112, 1073744938, 3129, + 3133, 1073744984, 3161, 1073744992, // NOLINT + 3169, 1073745029, 3212, 1073745038, + 3216, 1073745042, 3240, 1073745066, // NOLINT + 3251, 1073745077, 3257, 3261, + 3294, 1073745120, 3297, 1073745137, // NOLINT + 3314, 1073745157, 3340, 1073745166, + 3344, 1073745170, 3386, 3389, // NOLINT + 3406, 1073745248, 3425, 1073745274, + 3455, 1073745285, 3478, 1073745306, // NOLINT + 3505, 1073745331, 3515, 3517, + 1073745344, 3526, 1073745409, 3632, // NOLINT + 1073745458, 3635, 1073745472, 3654, + 1073745537, 3714, 3716, 1073745543, // NOLINT + 3720, 3722, 3725, 1073745556, + 3735, 1073745561, 3743, 1073745569, // NOLINT + 3747, 3749, 3751, 1073745578, + 3755, 1073745581, 3760, 1073745586, // NOLINT + 3763, 3773, 1073745600, 3780, + 3782, 1073745628, 3807, 3840, // NOLINT + 1073745728, 3911, 1073745737, 3948, + 1073745800, 3980, 1073745920, 4138, // NOLINT + 4159, 1073746000, 4181, 1073746010, + 4189, 4193, 1073746021, 4198, // NOLINT + 1073746030, 4208, 1073746037, 4225, + 4238, 1073746080, 4293, 4295, // NOLINT + 4301, 1073746128, 4346, 1073746172, + 4680, 1073746506, 4685, 1073746512, // NOLINT + 4694, 4696, 1073746522, 4701, + 1073746528, 4744, 1073746570, 4749, // NOLINT + 1073746576, 4784, 1073746610, 4789, + 1073746616, 4798, 4800, 1073746626, // NOLINT + 4805, 1073746632, 4822, 1073746648, + 4880, 1073746706, 4885, 1073746712, // NOLINT + 4954, 1073746816, 5007, 1073746848, + 5108, 1073746945, 5740, 1073747567, // NOLINT + 5759, 1073747585, 5786, 1073747616, + 5866, 1073747694, 5880, 1073747712, // NOLINT + 5900, 1073747726, 5905, 1073747744, + 5937, 1073747776, 5969, 1073747808, 
// NOLINT + 5996, 1073747822, 6000, 1073747840, + 6067, 6103, 6108, 1073748000, // NOLINT + 6263, 1073748096, 6312, 6314, + 1073748144, 6389, 1073748224, 6430, // NOLINT + 1073748304, 6509, 1073748336, 6516, + 1073748352, 6571, 1073748417, 6599, // NOLINT + 1073748480, 6678, 1073748512, 6740, + 6823, 1073748741, 6963, 1073748805, // NOLINT + 6987, 1073748867, 7072, 1073748910, + 7087, 1073748922, 7141, 1073748992, // NOLINT + 7203, 1073749069, 7247, 1073749082, + 7293, 1073749225, 7404, 1073749230, // NOLINT + 7409, 1073749237, 7414, 1073749248, + 7615, 1073749504, 7957, 1073749784, // NOLINT + 7965, 1073749792, 8005, 1073749832, + 8013, 1073749840, 8023, 8025, // NOLINT + 8027, 8029, 1073749855, 8061, + 1073749888, 8116, 1073749942, 8124, // NOLINT + 8126, 1073749954, 8132, 1073749958, + 8140, 1073749968, 8147, 1073749974, // NOLINT + 8155, 1073749984, 8172, 1073750002, + 8180, 1073750006, 8188}; // NOLINT +static const uint16_t kLetterTable1Size = 87; +static const int32_t kLetterTable1[87] = { + 113, 127, 1073741968, 156, + 258, 263, 1073742090, 275, // NOLINT + 277, 1073742105, 285, 292, + 294, 296, 1073742122, 301, // NOLINT + 1073742127, 313, 1073742140, 319, + 1073742149, 329, 334, 1073742176, // NOLINT + 392, 1073744896, 3118, 1073744944, + 3166, 1073744992, 3300, 1073745131, // NOLINT + 3310, 1073745138, 3315, 1073745152, + 3365, 3367, 3373, 1073745200, // NOLINT + 3431, 3439, 1073745280, 3478, + 1073745312, 3494, 1073745320, 3502, // NOLINT + 1073745328, 3510, 1073745336, 3518, + 1073745344, 3526, 1073745352, 3534, // NOLINT + 1073745360, 3542, 1073745368, 3550, + 3631, 1073745925, 4103, 1073745953, // NOLINT + 4137, 1073745969, 4149, 1073745976, + 4156, 1073745985, 4246, 1073746077, // NOLINT + 4255, 1073746081, 4346, 1073746172, + 4351, 1073746181, 4397, 1073746225, // NOLINT + 4494, 1073746336, 4538, 1073746416, + 4607, 1073746944, 8191}; // NOLINT +static const uint16_t kLetterTable2Size = 4; +static const int32_t kLetterTable2[4] = {1073741824, 3509, 
1073745408, + 8191}; // NOLINT +static const uint16_t kLetterTable3Size = 2; +static const int32_t kLetterTable3[2] = {1073741824, 8191}; // NOLINT +static const uint16_t kLetterTable4Size = 2; +static const int32_t kLetterTable4[2] = {1073741824, 8140}; // NOLINT +static const uint16_t kLetterTable5Size = 100; +static const int32_t kLetterTable5[100] = { + 1073741824, 1164, 1073743056, 1277, + 1073743104, 1548, 1073743376, 1567, // NOLINT + 1073743402, 1579, 1073743424, 1646, + 1073743487, 1693, 1073743520, 1775, // NOLINT + 1073743639, 1823, 1073743650, 1928, + 1073743755, 1934, 1073743760, 1965, // NOLINT + 1073743792, 1969, 1073743863, 2049, + 1073743875, 2053, 1073743879, 2058, // NOLINT + 1073743884, 2082, 1073743936, 2163, + 1073744002, 2227, 1073744114, 2295, // NOLINT + 2299, 1073744138, 2341, 1073744176, + 2374, 1073744224, 2428, 1073744260, // NOLINT + 2482, 2511, 1073744352, 2532, + 1073744358, 2543, 1073744378, 2558, // NOLINT + 1073744384, 2600, 1073744448, 2626, + 1073744452, 2635, 1073744480, 2678, // NOLINT + 2682, 1073744510, 2735, 2737, + 1073744565, 2742, 1073744569, 2749, // NOLINT + 2752, 2754, 1073744603, 2781, + 1073744608, 2794, 1073744626, 2804, // NOLINT + 1073744641, 2822, 1073744649, 2830, + 1073744657, 2838, 1073744672, 2854, // NOLINT + 1073744680, 2862, 1073744688, 2906, + 1073744732, 2911, 1073744740, 2917, // NOLINT + 1073744832, 3042, 1073744896, 8191}; // NOLINT +static const uint16_t kLetterTable6Size = 6; +static const int32_t kLetterTable6[6] = {1073741824, 6051, 1073747888, 6086, + 1073747915, 6139}; // NOLINT +static const uint16_t kLetterTable7Size = 48; +static const int32_t kLetterTable7[48] = { + 1073748224, 6765, 1073748592, 6873, + 1073748736, 6918, 1073748755, 6935, // NOLINT + 6941, 1073748767, 6952, 1073748778, + 6966, 1073748792, 6972, 6974, // NOLINT + 1073748800, 6977, 1073748803, 6980, + 1073748806, 7089, 1073748947, 7485, // NOLINT + 1073749328, 7567, 1073749394, 7623, + 1073749488, 7675, 1073749616, 7796, // 
NOLINT + 1073749622, 7932, 1073749793, 7994, + 1073749825, 8026, 1073749862, 8126, // NOLINT + 1073749954, 8135, 1073749962, 8143, + 1073749970, 8151, 1073749978, 8156}; // NOLINT +bool Letter::Is(uchar c) { + int chunk_index = c >> 13; + switch (chunk_index) { + case 0: + return LookupPredicate(kLetterTable0, kLetterTable0Size, c); + case 1: + return LookupPredicate(kLetterTable1, kLetterTable1Size, c); + case 2: + return LookupPredicate(kLetterTable2, kLetterTable2Size, c); + case 3: + return LookupPredicate(kLetterTable3, kLetterTable3Size, c); + case 4: + return LookupPredicate(kLetterTable4, kLetterTable4Size, c); + case 5: + return LookupPredicate(kLetterTable5, kLetterTable5Size, c); + case 6: + return LookupPredicate(kLetterTable6, kLetterTable6Size, c); + case 7: + return LookupPredicate(kLetterTable7, kLetterTable7Size, c); + default: + return false; + } +} +#endif + +#ifndef V8_INTL_SUPPORT + +static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings0[1] = + { // NOLINT + {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262CanonicalizeTable0Size = 498; // NOLINT +static const int32_t kEcma262CanonicalizeTable0[996] = { + 1073741921, -128, 122, -128, 181, 2972, + 1073742048, -128, 246, -128, 1073742072, -128, + 254, -128, 255, 484, // NOLINT + 257, -4, 259, -4, 261, -4, + 263, -4, 265, -4, 267, -4, + 269, -4, 271, -4, // NOLINT + 273, -4, 275, -4, 277, -4, + 279, -4, 281, -4, 283, -4, + 285, -4, 287, -4, // NOLINT + 289, -4, 291, -4, 293, -4, + 295, -4, 297, -4, 299, -4, + 301, -4, 303, -4, // NOLINT + 307, -4, 309, -4, 311, -4, + 314, -4, 316, -4, 318, -4, + 320, -4, 322, -4, // NOLINT + 324, -4, 326, -4, 328, -4, + 331, -4, 333, -4, 335, -4, + 337, -4, 339, -4, // NOLINT + 341, -4, 343, -4, 345, -4, + 347, -4, 349, -4, 351, -4, + 353, -4, 355, -4, // NOLINT + 357, -4, 359, -4, 361, -4, + 363, -4, 365, -4, 367, -4, + 369, -4, 371, -4, // NOLINT + 373, -4, 375, -4, 378, -4, + 380, -4, 382, -4, 384, 780, + 387, -4, 389, -4, // NOLINT 
+ 392, -4, 396, -4, 402, -4, + 405, 388, 409, -4, 410, 652, + 414, 520, 417, -4, // NOLINT + 419, -4, 421, -4, 424, -4, + 429, -4, 432, -4, 436, -4, + 438, -4, 441, -4, // NOLINT + 445, -4, 447, 224, 453, -4, + 454, -8, 456, -4, 457, -8, + 459, -4, 460, -8, // NOLINT + 462, -4, 464, -4, 466, -4, + 468, -4, 470, -4, 472, -4, + 474, -4, 476, -4, // NOLINT + 477, -316, 479, -4, 481, -4, + 483, -4, 485, -4, 487, -4, + 489, -4, 491, -4, // NOLINT + 493, -4, 495, -4, 498, -4, + 499, -8, 501, -4, 505, -4, + 507, -4, 509, -4, // NOLINT + 511, -4, 513, -4, 515, -4, + 517, -4, 519, -4, 521, -4, + 523, -4, 525, -4, // NOLINT + 527, -4, 529, -4, 531, -4, + 533, -4, 535, -4, 537, -4, + 539, -4, 541, -4, // NOLINT + 543, -4, 547, -4, 549, -4, + 551, -4, 553, -4, 555, -4, + 557, -4, 559, -4, // NOLINT + 561, -4, 563, -4, 572, -4, + 1073742399, 43260, 576, 43260, 578, -4, + 583, -4, 585, -4, // NOLINT + 587, -4, 589, -4, 591, -4, + 592, 43132, 593, 43120, 594, 43128, + 595, -840, 596, -824, // NOLINT + 1073742422, -820, 599, -820, 601, -808, + 603, -812, 604, 169276, 608, -820, + 609, 169260, 611, -828, // NOLINT + 613, 169120, 614, 169232, 616, -836, + 617, -844, 619, 42972, 620, 169220, + 623, -844, 625, 42996, // NOLINT + 626, -852, 629, -856, 637, 42908, + 640, -872, 643, -872, 647, 169128, + 648, -872, 649, -276, // NOLINT + 1073742474, -868, 651, -868, 652, -284, + 658, -876, 670, 169032, 837, 336, + 881, -4, 883, -4, // NOLINT + 887, -4, 1073742715, 520, 893, 520, + 940, -152, 1073742765, -148, 943, -148, + 1073742769, -128, 961, -128, // NOLINT + 962, -124, 1073742787, -128, 971, -128, + 972, -256, 1073742797, -252, 974, -252, + 976, -248, 977, -228, // NOLINT + 981, -188, 982, -216, 983, -32, + 985, -4, 987, -4, 989, -4, + 991, -4, 993, -4, // NOLINT + 995, -4, 997, -4, 999, -4, + 1001, -4, 1003, -4, 1005, -4, + 1007, -4, 1008, -344, // NOLINT + 1009, -320, 1010, 28, 1011, -464, + 1013, -384, 1016, -4, 1019, -4, + 1073742896, -128, 1103, -128, // NOLINT + 1073742928, 
-320, 1119, -320, 1121, -4, + 1123, -4, 1125, -4, 1127, -4, + 1129, -4, 1131, -4, // NOLINT + 1133, -4, 1135, -4, 1137, -4, + 1139, -4, 1141, -4, 1143, -4, + 1145, -4, 1147, -4, // NOLINT + 1149, -4, 1151, -4, 1153, -4, + 1163, -4, 1165, -4, 1167, -4, + 1169, -4, 1171, -4, // NOLINT + 1173, -4, 1175, -4, 1177, -4, + 1179, -4, 1181, -4, 1183, -4, + 1185, -4, 1187, -4, // NOLINT + 1189, -4, 1191, -4, 1193, -4, + 1195, -4, 1197, -4, 1199, -4, + 1201, -4, 1203, -4, // NOLINT + 1205, -4, 1207, -4, 1209, -4, + 1211, -4, 1213, -4, 1215, -4, + 1218, -4, 1220, -4, // NOLINT + 1222, -4, 1224, -4, 1226, -4, + 1228, -4, 1230, -4, 1231, -60, + 1233, -4, 1235, -4, // NOLINT + 1237, -4, 1239, -4, 1241, -4, + 1243, -4, 1245, -4, 1247, -4, + 1249, -4, 1251, -4, // NOLINT + 1253, -4, 1255, -4, 1257, -4, + 1259, -4, 1261, -4, 1263, -4, + 1265, -4, 1267, -4, // NOLINT + 1269, -4, 1271, -4, 1273, -4, + 1275, -4, 1277, -4, 1279, -4, + 1281, -4, 1283, -4, // NOLINT + 1285, -4, 1287, -4, 1289, -4, + 1291, -4, 1293, -4, 1295, -4, + 1297, -4, 1299, -4, // NOLINT + 1301, -4, 1303, -4, 1305, -4, + 1307, -4, 1309, -4, 1311, -4, + 1313, -4, 1315, -4, // NOLINT + 1317, -4, 1319, -4, 1321, -4, + 1323, -4, 1325, -4, 1327, -4, + 1073743201, -192, 1414, -192, // NOLINT + 7545, 141328, 7549, 15256, 7681, -4, + 7683, -4, 7685, -4, 7687, -4, + 7689, -4, 7691, -4, // NOLINT + 7693, -4, 7695, -4, 7697, -4, + 7699, -4, 7701, -4, 7703, -4, + 7705, -4, 7707, -4, // NOLINT + 7709, -4, 7711, -4, 7713, -4, + 7715, -4, 7717, -4, 7719, -4, + 7721, -4, 7723, -4, // NOLINT + 7725, -4, 7727, -4, 7729, -4, + 7731, -4, 7733, -4, 7735, -4, + 7737, -4, 7739, -4, // NOLINT + 7741, -4, 7743, -4, 7745, -4, + 7747, -4, 7749, -4, 7751, -4, + 7753, -4, 7755, -4, // NOLINT + 7757, -4, 7759, -4, 7761, -4, + 7763, -4, 7765, -4, 7767, -4, + 7769, -4, 7771, -4, // NOLINT + 7773, -4, 7775, -4, 7777, -4, + 7779, -4, 7781, -4, 7783, -4, + 7785, -4, 7787, -4, // NOLINT + 7789, -4, 7791, -4, 7793, -4, + 7795, -4, 7797, -4, 7799, -4, + 
7801, -4, 7803, -4, // NOLINT + 7805, -4, 7807, -4, 7809, -4, + 7811, -4, 7813, -4, 7815, -4, + 7817, -4, 7819, -4, // NOLINT + 7821, -4, 7823, -4, 7825, -4, + 7827, -4, 7829, -4, 7835, -236, + 7841, -4, 7843, -4, // NOLINT + 7845, -4, 7847, -4, 7849, -4, + 7851, -4, 7853, -4, 7855, -4, + 7857, -4, 7859, -4, // NOLINT + 7861, -4, 7863, -4, 7865, -4, + 7867, -4, 7869, -4, 7871, -4, + 7873, -4, 7875, -4, // NOLINT + 7877, -4, 7879, -4, 7881, -4, + 7883, -4, 7885, -4, 7887, -4, + 7889, -4, 7891, -4, // NOLINT + 7893, -4, 7895, -4, 7897, -4, + 7899, -4, 7901, -4, 7903, -4, + 7905, -4, 7907, -4, // NOLINT + 7909, -4, 7911, -4, 7913, -4, + 7915, -4, 7917, -4, 7919, -4, + 7921, -4, 7923, -4, // NOLINT + 7925, -4, 7927, -4, 7929, -4, + 7931, -4, 7933, -4, 7935, -4, + 1073749760, 32, 7943, 32, // NOLINT + 1073749776, 32, 7957, 32, 1073749792, 32, + 7975, 32, 1073749808, 32, 7991, 32, + 1073749824, 32, 8005, 32, // NOLINT + 8017, 32, 8019, 32, 8021, 32, + 8023, 32, 1073749856, 32, 8039, 32, + 1073749872, 296, 8049, 296, // NOLINT + 1073749874, 344, 8053, 344, 1073749878, 400, + 8055, 400, 1073749880, 512, 8057, 512, + 1073749882, 448, 8059, 448, // NOLINT + 1073749884, 504, 8061, 504, 1073749936, 32, + 8113, 32, 8126, -28820, 1073749968, 32, + 8145, 32, 1073749984, 32, // NOLINT + 8161, 32, 8165, 28}; // NOLINT +static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings1[1] = + { // NOLINT + {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262CanonicalizeTable1Size = 73; // NOLINT +static const int32_t kEcma262CanonicalizeTable1[146] = { + 334, -112, 1073742192, -64, 383, -64, + 388, -4, 1073743056, -104, 1257, -104, + 1073744944, -192, 3166, -192, // NOLINT + 3169, -4, 3173, -43180, 3174, -43168, + 3176, -4, 3178, -4, 3180, -4, + 3187, -4, 3190, -4, // NOLINT + 3201, -4, 3203, -4, 3205, -4, + 3207, -4, 3209, -4, 3211, -4, + 3213, -4, 3215, -4, // NOLINT + 3217, -4, 3219, -4, 3221, -4, + 3223, -4, 3225, -4, 3227, -4, + 3229, -4, 3231, -4, // NOLINT + 
3233, -4, 3235, -4, 3237, -4, + 3239, -4, 3241, -4, 3243, -4, + 3245, -4, 3247, -4, // NOLINT + 3249, -4, 3251, -4, 3253, -4, + 3255, -4, 3257, -4, 3259, -4, + 3261, -4, 3263, -4, // NOLINT + 3265, -4, 3267, -4, 3269, -4, + 3271, -4, 3273, -4, 3275, -4, + 3277, -4, 3279, -4, // NOLINT + 3281, -4, 3283, -4, 3285, -4, + 3287, -4, 3289, -4, 3291, -4, + 3293, -4, 3295, -4, // NOLINT + 3297, -4, 3299, -4, 3308, -4, + 3310, -4, 3315, -4, 1073745152, -29056, + 3365, -29056, 3367, -29056, // NOLINT + 3373, -29056}; // NOLINT +static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings5[1] = + { // NOLINT + {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262CanonicalizeTable5Size = 95; // NOLINT +static const int32_t kEcma262CanonicalizeTable5[190] = + { + 1601, -4, 1603, -4, 1605, -4, 1607, -4, + 1609, -4, 1611, -4, 1613, -4, 1615, -4, // NOLINT + 1617, -4, 1619, -4, 1621, -4, 1623, -4, + 1625, -4, 1627, -4, 1629, -4, 1631, -4, // NOLINT + 1633, -4, 1635, -4, 1637, -4, 1639, -4, + 1641, -4, 1643, -4, 1645, -4, 1665, -4, // NOLINT + 1667, -4, 1669, -4, 1671, -4, 1673, -4, + 1675, -4, 1677, -4, 1679, -4, 1681, -4, // NOLINT + 1683, -4, 1685, -4, 1687, -4, 1689, -4, + 1691, -4, 1827, -4, 1829, -4, 1831, -4, // NOLINT + 1833, -4, 1835, -4, 1837, -4, 1839, -4, + 1843, -4, 1845, -4, 1847, -4, 1849, -4, // NOLINT + 1851, -4, 1853, -4, 1855, -4, 1857, -4, + 1859, -4, 1861, -4, 1863, -4, 1865, -4, // NOLINT + 1867, -4, 1869, -4, 1871, -4, 1873, -4, + 1875, -4, 1877, -4, 1879, -4, 1881, -4, // NOLINT + 1883, -4, 1885, -4, 1887, -4, 1889, -4, + 1891, -4, 1893, -4, 1895, -4, 1897, -4, // NOLINT + 1899, -4, 1901, -4, 1903, -4, 1914, -4, + 1916, -4, 1919, -4, 1921, -4, 1923, -4, // NOLINT + 1925, -4, 1927, -4, 1932, -4, 1937, -4, + 1939, -4, 1943, -4, 1945, -4, 1947, -4, // NOLINT + 1949, -4, 1951, -4, 1953, -4, 1955, -4, + 1957, -4, 1959, -4, 1961, -4}; // NOLINT +static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings7[1] = + { // NOLINT + 
{{kSentinel}}}; // NOLINT +static const uint16_t kEcma262CanonicalizeTable7Size = 2; // NOLINT +static const int32_t kEcma262CanonicalizeTable7[4] = {1073749825, -128, 8026, + -128}; // NOLINT +int Ecma262Canonicalize::Convert(uchar c, uchar n, uchar* result, + bool* allow_caching_ptr) { + int chunk_index = c >> 13; + switch (chunk_index) { + case 0: + return LookupMapping<true>( + kEcma262CanonicalizeTable0, kEcma262CanonicalizeTable0Size, + kEcma262CanonicalizeMultiStrings0, c, n, result, allow_caching_ptr); + case 1: + return LookupMapping<true>( + kEcma262CanonicalizeTable1, kEcma262CanonicalizeTable1Size, + kEcma262CanonicalizeMultiStrings1, c, n, result, allow_caching_ptr); + case 5: + return LookupMapping<true>( + kEcma262CanonicalizeTable5, kEcma262CanonicalizeTable5Size, + kEcma262CanonicalizeMultiStrings5, c, n, result, allow_caching_ptr); + case 7: + return LookupMapping<true>( + kEcma262CanonicalizeTable7, kEcma262CanonicalizeTable7Size, + kEcma262CanonicalizeMultiStrings7, c, n, result, allow_caching_ptr); + default: + return 0; + } +} + +static const MultiCharacterSpecialCase<4> + kEcma262UnCanonicalizeMultiStrings0[507] = { // NOLINT + {{65, 97, kSentinel}}, + {{90, 122, kSentinel}}, + {{181, 924, 956, kSentinel}}, + {{192, 224, kSentinel}}, // NOLINT + {{214, 246, kSentinel}}, + {{216, 248, kSentinel}}, + {{222, 254, kSentinel}}, + {{255, 376, kSentinel}}, // NOLINT + {{256, 257, kSentinel}}, + {{258, 259, kSentinel}}, + {{260, 261, kSentinel}}, + {{262, 263, kSentinel}}, // NOLINT + {{264, 265, kSentinel}}, + {{266, 267, kSentinel}}, + {{268, 269, kSentinel}}, + {{270, 271, kSentinel}}, // NOLINT + {{272, 273, kSentinel}}, + {{274, 275, kSentinel}}, + {{276, 277, kSentinel}}, + {{278, 279, kSentinel}}, // NOLINT + {{280, 281, kSentinel}}, + {{282, 283, kSentinel}}, + {{284, 285, kSentinel}}, + {{286, 287, kSentinel}}, // NOLINT + {{288, 289, kSentinel}}, + {{290, 291, kSentinel}}, + {{292, 293, kSentinel}}, + {{294, 295, kSentinel}}, // NOLINT + 
{{296, 297, kSentinel}}, + {{298, 299, kSentinel}}, + {{300, 301, kSentinel}}, + {{302, 303, kSentinel}}, // NOLINT + {{306, 307, kSentinel}}, + {{308, 309, kSentinel}}, + {{310, 311, kSentinel}}, + {{313, 314, kSentinel}}, // NOLINT + {{315, 316, kSentinel}}, + {{317, 318, kSentinel}}, + {{319, 320, kSentinel}}, + {{321, 322, kSentinel}}, // NOLINT + {{323, 324, kSentinel}}, + {{325, 326, kSentinel}}, + {{327, 328, kSentinel}}, + {{330, 331, kSentinel}}, // NOLINT + {{332, 333, kSentinel}}, + {{334, 335, kSentinel}}, + {{336, 337, kSentinel}}, + {{338, 339, kSentinel}}, // NOLINT + {{340, 341, kSentinel}}, + {{342, 343, kSentinel}}, + {{344, 345, kSentinel}}, + {{346, 347, kSentinel}}, // NOLINT + {{348, 349, kSentinel}}, + {{350, 351, kSentinel}}, + {{352, 353, kSentinel}}, + {{354, 355, kSentinel}}, // NOLINT + {{356, 357, kSentinel}}, + {{358, 359, kSentinel}}, + {{360, 361, kSentinel}}, + {{362, 363, kSentinel}}, // NOLINT + {{364, 365, kSentinel}}, + {{366, 367, kSentinel}}, + {{368, 369, kSentinel}}, + {{370, 371, kSentinel}}, // NOLINT + {{372, 373, kSentinel}}, + {{374, 375, kSentinel}}, + {{377, 378, kSentinel}}, + {{379, 380, kSentinel}}, // NOLINT + {{381, 382, kSentinel}}, + {{384, 579, kSentinel}}, + {{385, 595, kSentinel}}, + {{386, 387, kSentinel}}, // NOLINT + {{388, 389, kSentinel}}, + {{390, 596, kSentinel}}, + {{391, 392, kSentinel}}, + {{393, 598, kSentinel}}, // NOLINT + {{394, 599, kSentinel}}, + {{395, 396, kSentinel}}, + {{398, 477, kSentinel}}, + {{399, 601, kSentinel}}, // NOLINT + {{400, 603, kSentinel}}, + {{401, 402, kSentinel}}, + {{403, 608, kSentinel}}, + {{404, 611, kSentinel}}, // NOLINT + {{405, 502, kSentinel}}, + {{406, 617, kSentinel}}, + {{407, 616, kSentinel}}, + {{408, 409, kSentinel}}, // NOLINT + {{410, 573, kSentinel}}, + {{412, 623, kSentinel}}, + {{413, 626, kSentinel}}, + {{414, 544, kSentinel}}, // NOLINT + {{415, 629, kSentinel}}, + {{416, 417, kSentinel}}, + {{418, 419, kSentinel}}, + {{420, 421, kSentinel}}, // 
NOLINT + {{422, 640, kSentinel}}, + {{423, 424, kSentinel}}, + {{425, 643, kSentinel}}, + {{428, 429, kSentinel}}, // NOLINT + {{430, 648, kSentinel}}, + {{431, 432, kSentinel}}, + {{433, 650, kSentinel}}, + {{434, 651, kSentinel}}, // NOLINT + {{435, 436, kSentinel}}, + {{437, 438, kSentinel}}, + {{439, 658, kSentinel}}, + {{440, 441, kSentinel}}, // NOLINT + {{444, 445, kSentinel}}, + {{447, 503, kSentinel}}, + {{452, 453, 454, kSentinel}}, + {{455, 456, 457, kSentinel}}, // NOLINT + {{458, 459, 460, kSentinel}}, + {{461, 462, kSentinel}}, + {{463, 464, kSentinel}}, + {{465, 466, kSentinel}}, // NOLINT + {{467, 468, kSentinel}}, + {{469, 470, kSentinel}}, + {{471, 472, kSentinel}}, + {{473, 474, kSentinel}}, // NOLINT + {{475, 476, kSentinel}}, + {{478, 479, kSentinel}}, + {{480, 481, kSentinel}}, + {{482, 483, kSentinel}}, // NOLINT + {{484, 485, kSentinel}}, + {{486, 487, kSentinel}}, + {{488, 489, kSentinel}}, + {{490, 491, kSentinel}}, // NOLINT + {{492, 493, kSentinel}}, + {{494, 495, kSentinel}}, + {{497, 498, 499, kSentinel}}, + {{500, 501, kSentinel}}, // NOLINT + {{504, 505, kSentinel}}, + {{506, 507, kSentinel}}, + {{508, 509, kSentinel}}, + {{510, 511, kSentinel}}, // NOLINT + {{512, 513, kSentinel}}, + {{514, 515, kSentinel}}, + {{516, 517, kSentinel}}, + {{518, 519, kSentinel}}, // NOLINT + {{520, 521, kSentinel}}, + {{522, 523, kSentinel}}, + {{524, 525, kSentinel}}, + {{526, 527, kSentinel}}, // NOLINT + {{528, 529, kSentinel}}, + {{530, 531, kSentinel}}, + {{532, 533, kSentinel}}, + {{534, 535, kSentinel}}, // NOLINT + {{536, 537, kSentinel}}, + {{538, 539, kSentinel}}, + {{540, 541, kSentinel}}, + {{542, 543, kSentinel}}, // NOLINT + {{546, 547, kSentinel}}, + {{548, 549, kSentinel}}, + {{550, 551, kSentinel}}, + {{552, 553, kSentinel}}, // NOLINT + {{554, 555, kSentinel}}, + {{556, 557, kSentinel}}, + {{558, 559, kSentinel}}, + {{560, 561, kSentinel}}, // NOLINT + {{562, 563, kSentinel}}, + {{570, 11365, kSentinel}}, + {{571, 572, kSentinel}}, + 
{{574, 11366, kSentinel}}, // NOLINT + {{575, 11390, kSentinel}}, + {{576, 11391, kSentinel}}, + {{577, 578, kSentinel}}, + {{580, 649, kSentinel}}, // NOLINT + {{581, 652, kSentinel}}, + {{582, 583, kSentinel}}, + {{584, 585, kSentinel}}, + {{586, 587, kSentinel}}, // NOLINT + {{588, 589, kSentinel}}, + {{590, 591, kSentinel}}, + {{592, 11375, kSentinel}}, + {{593, 11373, kSentinel}}, // NOLINT + {{594, 11376, kSentinel}}, + {{604, 42923, kSentinel}}, + {{609, 42924, kSentinel}}, + {{613, 42893, kSentinel}}, // NOLINT + {{614, 42922, kSentinel}}, + {{619, 11362, kSentinel}}, + {{620, 42925, kSentinel}}, + {{625, 11374, kSentinel}}, // NOLINT + {{637, 11364, kSentinel}}, + {{647, 42929, kSentinel}}, + {{670, 42928, kSentinel}}, + {{837, 921, 953, 8126}}, // NOLINT + {{880, 881, kSentinel}}, + {{882, 883, kSentinel}}, + {{886, 887, kSentinel}}, + {{891, 1021, kSentinel}}, // NOLINT + {{893, 1023, kSentinel}}, + {{895, 1011, kSentinel}}, + {{902, 940, kSentinel}}, + {{904, 941, kSentinel}}, // NOLINT + {{906, 943, kSentinel}}, + {{908, 972, kSentinel}}, + {{910, 973, kSentinel}}, + {{911, 974, kSentinel}}, // NOLINT + {{913, 945, kSentinel}}, + {{914, 946, 976, kSentinel}}, + {{915, 947, kSentinel}}, + {{916, 948, kSentinel}}, // NOLINT + {{917, 949, 1013, kSentinel}}, + {{918, 950, kSentinel}}, + {{919, 951, kSentinel}}, + {{920, 952, 977, kSentinel}}, // NOLINT + {{922, 954, 1008, kSentinel}}, + {{923, 955, kSentinel}}, + {{925, 957, kSentinel}}, + {{927, 959, kSentinel}}, // NOLINT + {{928, 960, 982, kSentinel}}, + {{929, 961, 1009, kSentinel}}, + {{931, 962, 963, kSentinel}}, + {{932, 964, kSentinel}}, // NOLINT + {{933, 965, kSentinel}}, + {{934, 966, 981, kSentinel}}, + {{935, 967, kSentinel}}, + {{939, 971, kSentinel}}, // NOLINT + {{975, 983, kSentinel}}, + {{984, 985, kSentinel}}, + {{986, 987, kSentinel}}, + {{988, 989, kSentinel}}, // NOLINT + {{990, 991, kSentinel}}, + {{992, 993, kSentinel}}, + {{994, 995, kSentinel}}, + {{996, 997, kSentinel}}, // 
NOLINT + {{998, 999, kSentinel}}, + {{1000, 1001, kSentinel}}, + {{1002, 1003, kSentinel}}, + {{1004, 1005, kSentinel}}, // NOLINT + {{1006, 1007, kSentinel}}, + {{1010, 1017, kSentinel}}, + {{1015, 1016, kSentinel}}, + {{1018, 1019, kSentinel}}, // NOLINT + {{1024, 1104, kSentinel}}, + {{1039, 1119, kSentinel}}, + {{1040, 1072, kSentinel}}, + {{1071, 1103, kSentinel}}, // NOLINT + {{1120, 1121, kSentinel}}, + {{1122, 1123, kSentinel}}, + {{1124, 1125, kSentinel}}, + {{1126, 1127, kSentinel}}, // NOLINT + {{1128, 1129, kSentinel}}, + {{1130, 1131, kSentinel}}, + {{1132, 1133, kSentinel}}, + {{1134, 1135, kSentinel}}, // NOLINT + {{1136, 1137, kSentinel}}, + {{1138, 1139, kSentinel}}, + {{1140, 1141, kSentinel}}, + {{1142, 1143, kSentinel}}, // NOLINT + {{1144, 1145, kSentinel}}, + {{1146, 1147, kSentinel}}, + {{1148, 1149, kSentinel}}, + {{1150, 1151, kSentinel}}, // NOLINT + {{1152, 1153, kSentinel}}, + {{1162, 1163, kSentinel}}, + {{1164, 1165, kSentinel}}, + {{1166, 1167, kSentinel}}, // NOLINT + {{1168, 1169, kSentinel}}, + {{1170, 1171, kSentinel}}, + {{1172, 1173, kSentinel}}, + {{1174, 1175, kSentinel}}, // NOLINT + {{1176, 1177, kSentinel}}, + {{1178, 1179, kSentinel}}, + {{1180, 1181, kSentinel}}, + {{1182, 1183, kSentinel}}, // NOLINT + {{1184, 1185, kSentinel}}, + {{1186, 1187, kSentinel}}, + {{1188, 1189, kSentinel}}, + {{1190, 1191, kSentinel}}, // NOLINT + {{1192, 1193, kSentinel}}, + {{1194, 1195, kSentinel}}, + {{1196, 1197, kSentinel}}, + {{1198, 1199, kSentinel}}, // NOLINT + {{1200, 1201, kSentinel}}, + {{1202, 1203, kSentinel}}, + {{1204, 1205, kSentinel}}, + {{1206, 1207, kSentinel}}, // NOLINT + {{1208, 1209, kSentinel}}, + {{1210, 1211, kSentinel}}, + {{1212, 1213, kSentinel}}, + {{1214, 1215, kSentinel}}, // NOLINT + {{1216, 1231, kSentinel}}, + {{1217, 1218, kSentinel}}, + {{1219, 1220, kSentinel}}, + {{1221, 1222, kSentinel}}, // NOLINT + {{1223, 1224, kSentinel}}, + {{1225, 1226, kSentinel}}, + {{1227, 1228, kSentinel}}, + {{1229, 1230, 
kSentinel}}, // NOLINT + {{1232, 1233, kSentinel}}, + {{1234, 1235, kSentinel}}, + {{1236, 1237, kSentinel}}, + {{1238, 1239, kSentinel}}, // NOLINT + {{1240, 1241, kSentinel}}, + {{1242, 1243, kSentinel}}, + {{1244, 1245, kSentinel}}, + {{1246, 1247, kSentinel}}, // NOLINT + {{1248, 1249, kSentinel}}, + {{1250, 1251, kSentinel}}, + {{1252, 1253, kSentinel}}, + {{1254, 1255, kSentinel}}, // NOLINT + {{1256, 1257, kSentinel}}, + {{1258, 1259, kSentinel}}, + {{1260, 1261, kSentinel}}, + {{1262, 1263, kSentinel}}, // NOLINT + {{1264, 1265, kSentinel}}, + {{1266, 1267, kSentinel}}, + {{1268, 1269, kSentinel}}, + {{1270, 1271, kSentinel}}, // NOLINT + {{1272, 1273, kSentinel}}, + {{1274, 1275, kSentinel}}, + {{1276, 1277, kSentinel}}, + {{1278, 1279, kSentinel}}, // NOLINT + {{1280, 1281, kSentinel}}, + {{1282, 1283, kSentinel}}, + {{1284, 1285, kSentinel}}, + {{1286, 1287, kSentinel}}, // NOLINT + {{1288, 1289, kSentinel}}, + {{1290, 1291, kSentinel}}, + {{1292, 1293, kSentinel}}, + {{1294, 1295, kSentinel}}, // NOLINT + {{1296, 1297, kSentinel}}, + {{1298, 1299, kSentinel}}, + {{1300, 1301, kSentinel}}, + {{1302, 1303, kSentinel}}, // NOLINT + {{1304, 1305, kSentinel}}, + {{1306, 1307, kSentinel}}, + {{1308, 1309, kSentinel}}, + {{1310, 1311, kSentinel}}, // NOLINT + {{1312, 1313, kSentinel}}, + {{1314, 1315, kSentinel}}, + {{1316, 1317, kSentinel}}, + {{1318, 1319, kSentinel}}, // NOLINT + {{1320, 1321, kSentinel}}, + {{1322, 1323, kSentinel}}, + {{1324, 1325, kSentinel}}, + {{1326, 1327, kSentinel}}, // NOLINT + {{1329, 1377, kSentinel}}, + {{1366, 1414, kSentinel}}, + {{4256, 11520, kSentinel}}, + {{4293, 11557, kSentinel}}, // NOLINT + {{4295, 11559, kSentinel}}, + {{4301, 11565, kSentinel}}, + {{7545, 42877, kSentinel}}, + {{7549, 11363, kSentinel}}, // NOLINT + {{7680, 7681, kSentinel}}, + {{7682, 7683, kSentinel}}, + {{7684, 7685, kSentinel}}, + {{7686, 7687, kSentinel}}, // NOLINT + {{7688, 7689, kSentinel}}, + {{7690, 7691, kSentinel}}, + {{7692, 7693, 
kSentinel}}, + {{7694, 7695, kSentinel}}, // NOLINT + {{7696, 7697, kSentinel}}, + {{7698, 7699, kSentinel}}, + {{7700, 7701, kSentinel}}, + {{7702, 7703, kSentinel}}, // NOLINT + {{7704, 7705, kSentinel}}, + {{7706, 7707, kSentinel}}, + {{7708, 7709, kSentinel}}, + {{7710, 7711, kSentinel}}, // NOLINT + {{7712, 7713, kSentinel}}, + {{7714, 7715, kSentinel}}, + {{7716, 7717, kSentinel}}, + {{7718, 7719, kSentinel}}, // NOLINT + {{7720, 7721, kSentinel}}, + {{7722, 7723, kSentinel}}, + {{7724, 7725, kSentinel}}, + {{7726, 7727, kSentinel}}, // NOLINT + {{7728, 7729, kSentinel}}, + {{7730, 7731, kSentinel}}, + {{7732, 7733, kSentinel}}, + {{7734, 7735, kSentinel}}, // NOLINT + {{7736, 7737, kSentinel}}, + {{7738, 7739, kSentinel}}, + {{7740, 7741, kSentinel}}, + {{7742, 7743, kSentinel}}, // NOLINT + {{7744, 7745, kSentinel}}, + {{7746, 7747, kSentinel}}, + {{7748, 7749, kSentinel}}, + {{7750, 7751, kSentinel}}, // NOLINT + {{7752, 7753, kSentinel}}, + {{7754, 7755, kSentinel}}, + {{7756, 7757, kSentinel}}, + {{7758, 7759, kSentinel}}, // NOLINT + {{7760, 7761, kSentinel}}, + {{7762, 7763, kSentinel}}, + {{7764, 7765, kSentinel}}, + {{7766, 7767, kSentinel}}, // NOLINT + {{7768, 7769, kSentinel}}, + {{7770, 7771, kSentinel}}, + {{7772, 7773, kSentinel}}, + {{7774, 7775, kSentinel}}, // NOLINT + {{7776, 7777, 7835, kSentinel}}, + {{7778, 7779, kSentinel}}, + {{7780, 7781, kSentinel}}, + {{7782, 7783, kSentinel}}, // NOLINT + {{7784, 7785, kSentinel}}, + {{7786, 7787, kSentinel}}, + {{7788, 7789, kSentinel}}, + {{7790, 7791, kSentinel}}, // NOLINT + {{7792, 7793, kSentinel}}, + {{7794, 7795, kSentinel}}, + {{7796, 7797, kSentinel}}, + {{7798, 7799, kSentinel}}, // NOLINT + {{7800, 7801, kSentinel}}, + {{7802, 7803, kSentinel}}, + {{7804, 7805, kSentinel}}, + {{7806, 7807, kSentinel}}, // NOLINT + {{7808, 7809, kSentinel}}, + {{7810, 7811, kSentinel}}, + {{7812, 7813, kSentinel}}, + {{7814, 7815, kSentinel}}, // NOLINT + {{7816, 7817, kSentinel}}, + {{7818, 7819, 
kSentinel}}, + {{7820, 7821, kSentinel}}, + {{7822, 7823, kSentinel}}, // NOLINT + {{7824, 7825, kSentinel}}, + {{7826, 7827, kSentinel}}, + {{7828, 7829, kSentinel}}, + {{7840, 7841, kSentinel}}, // NOLINT + {{7842, 7843, kSentinel}}, + {{7844, 7845, kSentinel}}, + {{7846, 7847, kSentinel}}, + {{7848, 7849, kSentinel}}, // NOLINT + {{7850, 7851, kSentinel}}, + {{7852, 7853, kSentinel}}, + {{7854, 7855, kSentinel}}, + {{7856, 7857, kSentinel}}, // NOLINT + {{7858, 7859, kSentinel}}, + {{7860, 7861, kSentinel}}, + {{7862, 7863, kSentinel}}, + {{7864, 7865, kSentinel}}, // NOLINT + {{7866, 7867, kSentinel}}, + {{7868, 7869, kSentinel}}, + {{7870, 7871, kSentinel}}, + {{7872, 7873, kSentinel}}, // NOLINT + {{7874, 7875, kSentinel}}, + {{7876, 7877, kSentinel}}, + {{7878, 7879, kSentinel}}, + {{7880, 7881, kSentinel}}, // NOLINT + {{7882, 7883, kSentinel}}, + {{7884, 7885, kSentinel}}, + {{7886, 7887, kSentinel}}, + {{7888, 7889, kSentinel}}, // NOLINT + {{7890, 7891, kSentinel}}, + {{7892, 7893, kSentinel}}, + {{7894, 7895, kSentinel}}, + {{7896, 7897, kSentinel}}, // NOLINT + {{7898, 7899, kSentinel}}, + {{7900, 7901, kSentinel}}, + {{7902, 7903, kSentinel}}, + {{7904, 7905, kSentinel}}, // NOLINT + {{7906, 7907, kSentinel}}, + {{7908, 7909, kSentinel}}, + {{7910, 7911, kSentinel}}, + {{7912, 7913, kSentinel}}, // NOLINT + {{7914, 7915, kSentinel}}, + {{7916, 7917, kSentinel}}, + {{7918, 7919, kSentinel}}, + {{7920, 7921, kSentinel}}, // NOLINT + {{7922, 7923, kSentinel}}, + {{7924, 7925, kSentinel}}, + {{7926, 7927, kSentinel}}, + {{7928, 7929, kSentinel}}, // NOLINT + {{7930, 7931, kSentinel}}, + {{7932, 7933, kSentinel}}, + {{7934, 7935, kSentinel}}, + {{7936, 7944, kSentinel}}, // NOLINT + {{7943, 7951, kSentinel}}, + {{7952, 7960, kSentinel}}, + {{7957, 7965, kSentinel}}, + {{7968, 7976, kSentinel}}, // NOLINT + {{7975, 7983, kSentinel}}, + {{7984, 7992, kSentinel}}, + {{7991, 7999, kSentinel}}, + {{8000, 8008, kSentinel}}, // NOLINT + {{8005, 8013, kSentinel}}, 
+ {{8017, 8025, kSentinel}}, + {{8019, 8027, kSentinel}}, + {{8021, 8029, kSentinel}}, // NOLINT + {{8023, 8031, kSentinel}}, + {{8032, 8040, kSentinel}}, + {{8039, 8047, kSentinel}}, + {{8048, 8122, kSentinel}}, // NOLINT + {{8049, 8123, kSentinel}}, + {{8050, 8136, kSentinel}}, + {{8053, 8139, kSentinel}}, + {{8054, 8154, kSentinel}}, // NOLINT + {{8055, 8155, kSentinel}}, + {{8056, 8184, kSentinel}}, + {{8057, 8185, kSentinel}}, + {{8058, 8170, kSentinel}}, // NOLINT + {{8059, 8171, kSentinel}}, + {{8060, 8186, kSentinel}}, + {{8061, 8187, kSentinel}}, + {{8112, 8120, kSentinel}}, // NOLINT + {{8113, 8121, kSentinel}}, + {{8144, 8152, kSentinel}}, + {{8145, 8153, kSentinel}}, + {{8160, 8168, kSentinel}}, // NOLINT + {{8161, 8169, kSentinel}}, + {{8165, 8172, kSentinel}}, + {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262UnCanonicalizeTable0Size = 1005; // NOLINT +static const int32_t kEcma262UnCanonicalizeTable0[2010] = { + 1073741889, 1, 90, 5, 1073741921, 1, + 122, 5, 181, 9, 1073742016, 13, + 214, 17, 1073742040, 21, // NOLINT + 222, 25, 1073742048, 13, 246, 17, + 1073742072, 21, 254, 25, 255, 29, + 256, 33, 257, 33, // NOLINT + 258, 37, 259, 37, 260, 41, + 261, 41, 262, 45, 263, 45, + 264, 49, 265, 49, // NOLINT + 266, 53, 267, 53, 268, 57, + 269, 57, 270, 61, 271, 61, + 272, 65, 273, 65, // NOLINT + 274, 69, 275, 69, 276, 73, + 277, 73, 278, 77, 279, 77, + 280, 81, 281, 81, // NOLINT + 282, 85, 283, 85, 284, 89, + 285, 89, 286, 93, 287, 93, + 288, 97, 289, 97, // NOLINT + 290, 101, 291, 101, 292, 105, + 293, 105, 294, 109, 295, 109, + 296, 113, 297, 113, // NOLINT + 298, 117, 299, 117, 300, 121, + 301, 121, 302, 125, 303, 125, + 306, 129, 307, 129, // NOLINT + 308, 133, 309, 133, 310, 137, + 311, 137, 313, 141, 314, 141, + 315, 145, 316, 145, // NOLINT + 317, 149, 318, 149, 319, 153, + 320, 153, 321, 157, 322, 157, + 323, 161, 324, 161, // NOLINT + 325, 165, 326, 165, 327, 169, + 328, 169, 330, 173, 331, 173, + 332, 177, 333, 177, // NOLINT + 334, 
181, 335, 181, 336, 185, + 337, 185, 338, 189, 339, 189, + 340, 193, 341, 193, // NOLINT + 342, 197, 343, 197, 344, 201, + 345, 201, 346, 205, 347, 205, + 348, 209, 349, 209, // NOLINT + 350, 213, 351, 213, 352, 217, + 353, 217, 354, 221, 355, 221, + 356, 225, 357, 225, // NOLINT + 358, 229, 359, 229, 360, 233, + 361, 233, 362, 237, 363, 237, + 364, 241, 365, 241, // NOLINT + 366, 245, 367, 245, 368, 249, + 369, 249, 370, 253, 371, 253, + 372, 257, 373, 257, // NOLINT + 374, 261, 375, 261, 376, 29, + 377, 265, 378, 265, 379, 269, + 380, 269, 381, 273, // NOLINT + 382, 273, 384, 277, 385, 281, + 386, 285, 387, 285, 388, 289, + 389, 289, 390, 293, // NOLINT + 391, 297, 392, 297, 1073742217, 301, + 394, 305, 395, 309, 396, 309, + 398, 313, 399, 317, // NOLINT + 400, 321, 401, 325, 402, 325, + 403, 329, 404, 333, 405, 337, + 406, 341, 407, 345, // NOLINT + 408, 349, 409, 349, 410, 353, + 412, 357, 413, 361, 414, 365, + 415, 369, 416, 373, // NOLINT + 417, 373, 418, 377, 419, 377, + 420, 381, 421, 381, 422, 385, + 423, 389, 424, 389, // NOLINT + 425, 393, 428, 397, 429, 397, + 430, 401, 431, 405, 432, 405, + 1073742257, 409, 434, 413, // NOLINT + 435, 417, 436, 417, 437, 421, + 438, 421, 439, 425, 440, 429, + 441, 429, 444, 433, // NOLINT + 445, 433, 447, 437, 452, 441, + 453, 441, 454, 441, 455, 445, + 456, 445, 457, 445, // NOLINT + 458, 449, 459, 449, 460, 449, + 461, 453, 462, 453, 463, 457, + 464, 457, 465, 461, // NOLINT + 466, 461, 467, 465, 468, 465, + 469, 469, 470, 469, 471, 473, + 472, 473, 473, 477, // NOLINT + 474, 477, 475, 481, 476, 481, + 477, 313, 478, 485, 479, 485, + 480, 489, 481, 489, // NOLINT + 482, 493, 483, 493, 484, 497, + 485, 497, 486, 501, 487, 501, + 488, 505, 489, 505, // NOLINT + 490, 509, 491, 509, 492, 513, + 493, 513, 494, 517, 495, 517, + 497, 521, 498, 521, // NOLINT + 499, 521, 500, 525, 501, 525, + 502, 337, 503, 437, 504, 529, + 505, 529, 506, 533, // NOLINT + 507, 533, 508, 537, 509, 537, + 510, 541, 511, 541, 512, 545, + 513, 
545, 514, 549, // NOLINT + 515, 549, 516, 553, 517, 553, + 518, 557, 519, 557, 520, 561, + 521, 561, 522, 565, // NOLINT + 523, 565, 524, 569, 525, 569, + 526, 573, 527, 573, 528, 577, + 529, 577, 530, 581, // NOLINT + 531, 581, 532, 585, 533, 585, + 534, 589, 535, 589, 536, 593, + 537, 593, 538, 597, // NOLINT + 539, 597, 540, 601, 541, 601, + 542, 605, 543, 605, 544, 365, + 546, 609, 547, 609, // NOLINT + 548, 613, 549, 613, 550, 617, + 551, 617, 552, 621, 553, 621, + 554, 625, 555, 625, // NOLINT + 556, 629, 557, 629, 558, 633, + 559, 633, 560, 637, 561, 637, + 562, 641, 563, 641, // NOLINT + 570, 645, 571, 649, 572, 649, + 573, 353, 574, 653, 1073742399, 657, + 576, 661, 577, 665, // NOLINT + 578, 665, 579, 277, 580, 669, + 581, 673, 582, 677, 583, 677, + 584, 681, 585, 681, // NOLINT + 586, 685, 587, 685, 588, 689, + 589, 689, 590, 693, 591, 693, + 592, 697, 593, 701, // NOLINT + 594, 705, 595, 281, 596, 293, + 1073742422, 301, 599, 305, 601, 317, + 603, 321, 604, 709, // NOLINT + 608, 329, 609, 713, 611, 333, + 613, 717, 614, 721, 616, 345, + 617, 341, 619, 725, // NOLINT + 620, 729, 623, 357, 625, 733, + 626, 361, 629, 369, 637, 737, + 640, 385, 643, 393, // NOLINT + 647, 741, 648, 401, 649, 669, + 1073742474, 409, 651, 413, 652, 673, + 658, 425, 670, 745, // NOLINT + 837, 749, 880, 753, 881, 753, + 882, 757, 883, 757, 886, 761, + 887, 761, 1073742715, 765, // NOLINT + 893, 769, 895, 773, 902, 777, + 1073742728, 781, 906, 785, 908, 789, + 1073742734, 793, 911, 797, // NOLINT + 913, 801, 914, 805, 1073742739, 809, + 916, 813, 917, 817, 1073742742, 821, + 919, 825, 920, 829, // NOLINT + 921, 749, 922, 833, 923, 837, + 924, 9, 1073742749, 841, 927, 845, + 928, 849, 929, 853, // NOLINT + 931, 857, 1073742756, 861, 933, 865, + 934, 869, 1073742759, 873, 939, 877, + 940, 777, 1073742765, 781, // NOLINT + 943, 785, 945, 801, 946, 805, + 1073742771, 809, 948, 813, 949, 817, + 1073742774, 821, 951, 825, // NOLINT + 952, 829, 953, 749, 954, 833, + 955, 837, 956, 9, 
1073742781, 841, + 959, 845, 960, 849, // NOLINT + 961, 853, 962, 857, 963, 857, + 1073742788, 861, 965, 865, 966, 869, + 1073742791, 873, 971, 877, // NOLINT + 972, 789, 1073742797, 793, 974, 797, + 975, 881, 976, 805, 977, 829, + 981, 869, 982, 849, // NOLINT + 983, 881, 984, 885, 985, 885, + 986, 889, 987, 889, 988, 893, + 989, 893, 990, 897, // NOLINT + 991, 897, 992, 901, 993, 901, + 994, 905, 995, 905, 996, 909, + 997, 909, 998, 913, // NOLINT + 999, 913, 1000, 917, 1001, 917, + 1002, 921, 1003, 921, 1004, 925, + 1005, 925, 1006, 929, // NOLINT + 1007, 929, 1008, 833, 1009, 853, + 1010, 933, 1011, 773, 1013, 817, + 1015, 937, 1016, 937, // NOLINT + 1017, 933, 1018, 941, 1019, 941, + 1073742845, 765, 1023, 769, 1073742848, 945, + 1039, 949, 1073742864, 953, // NOLINT + 1071, 957, 1073742896, 953, 1103, 957, + 1073742928, 945, 1119, 949, 1120, 961, + 1121, 961, 1122, 965, // NOLINT + 1123, 965, 1124, 969, 1125, 969, + 1126, 973, 1127, 973, 1128, 977, + 1129, 977, 1130, 981, // NOLINT + 1131, 981, 1132, 985, 1133, 985, + 1134, 989, 1135, 989, 1136, 993, + 1137, 993, 1138, 997, // NOLINT + 1139, 997, 1140, 1001, 1141, 1001, + 1142, 1005, 1143, 1005, 1144, 1009, + 1145, 1009, 1146, 1013, // NOLINT + 1147, 1013, 1148, 1017, 1149, 1017, + 1150, 1021, 1151, 1021, 1152, 1025, + 1153, 1025, 1162, 1029, // NOLINT + 1163, 1029, 1164, 1033, 1165, 1033, + 1166, 1037, 1167, 1037, 1168, 1041, + 1169, 1041, 1170, 1045, // NOLINT + 1171, 1045, 1172, 1049, 1173, 1049, + 1174, 1053, 1175, 1053, 1176, 1057, + 1177, 1057, 1178, 1061, // NOLINT + 1179, 1061, 1180, 1065, 1181, 1065, + 1182, 1069, 1183, 1069, 1184, 1073, + 1185, 1073, 1186, 1077, // NOLINT + 1187, 1077, 1188, 1081, 1189, 1081, + 1190, 1085, 1191, 1085, 1192, 1089, + 1193, 1089, 1194, 1093, // NOLINT + 1195, 1093, 1196, 1097, 1197, 1097, + 1198, 1101, 1199, 1101, 1200, 1105, + 1201, 1105, 1202, 1109, // NOLINT + 1203, 1109, 1204, 1113, 1205, 1113, + 1206, 1117, 1207, 1117, 1208, 1121, + 1209, 1121, 1210, 1125, // 
NOLINT + 1211, 1125, 1212, 1129, 1213, 1129, + 1214, 1133, 1215, 1133, 1216, 1137, + 1217, 1141, 1218, 1141, // NOLINT + 1219, 1145, 1220, 1145, 1221, 1149, + 1222, 1149, 1223, 1153, 1224, 1153, + 1225, 1157, 1226, 1157, // NOLINT + 1227, 1161, 1228, 1161, 1229, 1165, + 1230, 1165, 1231, 1137, 1232, 1169, + 1233, 1169, 1234, 1173, // NOLINT + 1235, 1173, 1236, 1177, 1237, 1177, + 1238, 1181, 1239, 1181, 1240, 1185, + 1241, 1185, 1242, 1189, // NOLINT + 1243, 1189, 1244, 1193, 1245, 1193, + 1246, 1197, 1247, 1197, 1248, 1201, + 1249, 1201, 1250, 1205, // NOLINT + 1251, 1205, 1252, 1209, 1253, 1209, + 1254, 1213, 1255, 1213, 1256, 1217, + 1257, 1217, 1258, 1221, // NOLINT + 1259, 1221, 1260, 1225, 1261, 1225, + 1262, 1229, 1263, 1229, 1264, 1233, + 1265, 1233, 1266, 1237, // NOLINT + 1267, 1237, 1268, 1241, 1269, 1241, + 1270, 1245, 1271, 1245, 1272, 1249, + 1273, 1249, 1274, 1253, // NOLINT + 1275, 1253, 1276, 1257, 1277, 1257, + 1278, 1261, 1279, 1261, 1280, 1265, + 1281, 1265, 1282, 1269, // NOLINT + 1283, 1269, 1284, 1273, 1285, 1273, + 1286, 1277, 1287, 1277, 1288, 1281, + 1289, 1281, 1290, 1285, // NOLINT + 1291, 1285, 1292, 1289, 1293, 1289, + 1294, 1293, 1295, 1293, 1296, 1297, + 1297, 1297, 1298, 1301, // NOLINT + 1299, 1301, 1300, 1305, 1301, 1305, + 1302, 1309, 1303, 1309, 1304, 1313, + 1305, 1313, 1306, 1317, // NOLINT + 1307, 1317, 1308, 1321, 1309, 1321, + 1310, 1325, 1311, 1325, 1312, 1329, + 1313, 1329, 1314, 1333, // NOLINT + 1315, 1333, 1316, 1337, 1317, 1337, + 1318, 1341, 1319, 1341, 1320, 1345, + 1321, 1345, 1322, 1349, // NOLINT + 1323, 1349, 1324, 1353, 1325, 1353, + 1326, 1357, 1327, 1357, 1073743153, 1361, + 1366, 1365, 1073743201, 1361, // NOLINT + 1414, 1365, 1073746080, 1369, 4293, 1373, + 4295, 1377, 4301, 1381, 7545, 1385, + 7549, 1389, 7680, 1393, // NOLINT + 7681, 1393, 7682, 1397, 7683, 1397, + 7684, 1401, 7685, 1401, 7686, 1405, + 7687, 1405, 7688, 1409, // NOLINT + 7689, 1409, 7690, 1413, 7691, 1413, + 7692, 1417, 7693, 1417, 7694, 
1421, + 7695, 1421, 7696, 1425, // NOLINT + 7697, 1425, 7698, 1429, 7699, 1429, + 7700, 1433, 7701, 1433, 7702, 1437, + 7703, 1437, 7704, 1441, // NOLINT + 7705, 1441, 7706, 1445, 7707, 1445, + 7708, 1449, 7709, 1449, 7710, 1453, + 7711, 1453, 7712, 1457, // NOLINT + 7713, 1457, 7714, 1461, 7715, 1461, + 7716, 1465, 7717, 1465, 7718, 1469, + 7719, 1469, 7720, 1473, // NOLINT + 7721, 1473, 7722, 1477, 7723, 1477, + 7724, 1481, 7725, 1481, 7726, 1485, + 7727, 1485, 7728, 1489, // NOLINT + 7729, 1489, 7730, 1493, 7731, 1493, + 7732, 1497, 7733, 1497, 7734, 1501, + 7735, 1501, 7736, 1505, // NOLINT + 7737, 1505, 7738, 1509, 7739, 1509, + 7740, 1513, 7741, 1513, 7742, 1517, + 7743, 1517, 7744, 1521, // NOLINT + 7745, 1521, 7746, 1525, 7747, 1525, + 7748, 1529, 7749, 1529, 7750, 1533, + 7751, 1533, 7752, 1537, // NOLINT + 7753, 1537, 7754, 1541, 7755, 1541, + 7756, 1545, 7757, 1545, 7758, 1549, + 7759, 1549, 7760, 1553, // NOLINT + 7761, 1553, 7762, 1557, 7763, 1557, + 7764, 1561, 7765, 1561, 7766, 1565, + 7767, 1565, 7768, 1569, // NOLINT + 7769, 1569, 7770, 1573, 7771, 1573, + 7772, 1577, 7773, 1577, 7774, 1581, + 7775, 1581, 7776, 1585, // NOLINT + 7777, 1585, 7778, 1589, 7779, 1589, + 7780, 1593, 7781, 1593, 7782, 1597, + 7783, 1597, 7784, 1601, // NOLINT + 7785, 1601, 7786, 1605, 7787, 1605, + 7788, 1609, 7789, 1609, 7790, 1613, + 7791, 1613, 7792, 1617, // NOLINT + 7793, 1617, 7794, 1621, 7795, 1621, + 7796, 1625, 7797, 1625, 7798, 1629, + 7799, 1629, 7800, 1633, // NOLINT + 7801, 1633, 7802, 1637, 7803, 1637, + 7804, 1641, 7805, 1641, 7806, 1645, + 7807, 1645, 7808, 1649, // NOLINT + 7809, 1649, 7810, 1653, 7811, 1653, + 7812, 1657, 7813, 1657, 7814, 1661, + 7815, 1661, 7816, 1665, // NOLINT + 7817, 1665, 7818, 1669, 7819, 1669, + 7820, 1673, 7821, 1673, 7822, 1677, + 7823, 1677, 7824, 1681, // NOLINT + 7825, 1681, 7826, 1685, 7827, 1685, + 7828, 1689, 7829, 1689, 7835, 1585, + 7840, 1693, 7841, 1693, // NOLINT + 7842, 1697, 7843, 1697, 7844, 1701, + 7845, 1701, 
7846, 1705, 7847, 1705, + 7848, 1709, 7849, 1709, // NOLINT + 7850, 1713, 7851, 1713, 7852, 1717, + 7853, 1717, 7854, 1721, 7855, 1721, + 7856, 1725, 7857, 1725, // NOLINT + 7858, 1729, 7859, 1729, 7860, 1733, + 7861, 1733, 7862, 1737, 7863, 1737, + 7864, 1741, 7865, 1741, // NOLINT + 7866, 1745, 7867, 1745, 7868, 1749, + 7869, 1749, 7870, 1753, 7871, 1753, + 7872, 1757, 7873, 1757, // NOLINT + 7874, 1761, 7875, 1761, 7876, 1765, + 7877, 1765, 7878, 1769, 7879, 1769, + 7880, 1773, 7881, 1773, // NOLINT + 7882, 1777, 7883, 1777, 7884, 1781, + 7885, 1781, 7886, 1785, 7887, 1785, + 7888, 1789, 7889, 1789, // NOLINT + 7890, 1793, 7891, 1793, 7892, 1797, + 7893, 1797, 7894, 1801, 7895, 1801, + 7896, 1805, 7897, 1805, // NOLINT + 7898, 1809, 7899, 1809, 7900, 1813, + 7901, 1813, 7902, 1817, 7903, 1817, + 7904, 1821, 7905, 1821, // NOLINT + 7906, 1825, 7907, 1825, 7908, 1829, + 7909, 1829, 7910, 1833, 7911, 1833, + 7912, 1837, 7913, 1837, // NOLINT + 7914, 1841, 7915, 1841, 7916, 1845, + 7917, 1845, 7918, 1849, 7919, 1849, + 7920, 1853, 7921, 1853, // NOLINT + 7922, 1857, 7923, 1857, 7924, 1861, + 7925, 1861, 7926, 1865, 7927, 1865, + 7928, 1869, 7929, 1869, // NOLINT + 7930, 1873, 7931, 1873, 7932, 1877, + 7933, 1877, 7934, 1881, 7935, 1881, + 1073749760, 1885, 7943, 1889, // NOLINT + 1073749768, 1885, 7951, 1889, 1073749776, 1893, + 7957, 1897, 1073749784, 1893, 7965, 1897, + 1073749792, 1901, 7975, 1905, // NOLINT + 1073749800, 1901, 7983, 1905, 1073749808, 1909, + 7991, 1913, 1073749816, 1909, 7999, 1913, + 1073749824, 1917, 8005, 1921, // NOLINT + 1073749832, 1917, 8013, 1921, 8017, 1925, + 8019, 1929, 8021, 1933, 8023, 1937, + 8025, 1925, 8027, 1929, // NOLINT + 8029, 1933, 8031, 1937, 1073749856, 1941, + 8039, 1945, 1073749864, 1941, 8047, 1945, + 1073749872, 1949, 8049, 1953, // NOLINT + 1073749874, 1957, 8053, 1961, 1073749878, 1965, + 8055, 1969, 1073749880, 1973, 8057, 1977, + 1073749882, 1981, 8059, 1985, // NOLINT + 1073749884, 1989, 8061, 1993, 1073749936, 
1997, + 8113, 2001, 1073749944, 1997, 8121, 2001, + 1073749946, 1949, 8123, 1953, // NOLINT + 8126, 749, 1073749960, 1957, 8139, 1961, + 1073749968, 2005, 8145, 2009, 1073749976, 2005, + 8153, 2009, 1073749978, 1965, // NOLINT + 8155, 1969, 1073749984, 2013, 8161, 2017, + 8165, 2021, 1073749992, 2013, 8169, 2017, + 1073749994, 1981, 8171, 1985, // NOLINT + 8172, 2021, 1073750008, 1973, 8185, 1977, + 1073750010, 1989, 8187, 1993}; // NOLINT +static const MultiCharacterSpecialCase<2> + kEcma262UnCanonicalizeMultiStrings1[83] = { // NOLINT + {{8498, 8526}}, {{8544, 8560}}, {{8559, 8575}}, + {{8579, 8580}}, // NOLINT + {{9398, 9424}}, {{9423, 9449}}, {{11264, 11312}}, + {{11310, 11358}}, // NOLINT + {{11360, 11361}}, {{619, 11362}}, {{7549, 11363}}, + {{637, 11364}}, // NOLINT + {{570, 11365}}, {{574, 11366}}, {{11367, 11368}}, + {{11369, 11370}}, // NOLINT + {{11371, 11372}}, {{593, 11373}}, {{625, 11374}}, + {{592, 11375}}, // NOLINT + {{594, 11376}}, {{11378, 11379}}, {{11381, 11382}}, + {{575, 11390}}, // NOLINT + {{576, 11391}}, {{11392, 11393}}, {{11394, 11395}}, + {{11396, 11397}}, // NOLINT + {{11398, 11399}}, {{11400, 11401}}, {{11402, 11403}}, + {{11404, 11405}}, // NOLINT + {{11406, 11407}}, {{11408, 11409}}, {{11410, 11411}}, + {{11412, 11413}}, // NOLINT + {{11414, 11415}}, {{11416, 11417}}, {{11418, 11419}}, + {{11420, 11421}}, // NOLINT + {{11422, 11423}}, {{11424, 11425}}, {{11426, 11427}}, + {{11428, 11429}}, // NOLINT + {{11430, 11431}}, {{11432, 11433}}, {{11434, 11435}}, + {{11436, 11437}}, // NOLINT + {{11438, 11439}}, {{11440, 11441}}, {{11442, 11443}}, + {{11444, 11445}}, // NOLINT + {{11446, 11447}}, {{11448, 11449}}, {{11450, 11451}}, + {{11452, 11453}}, // NOLINT + {{11454, 11455}}, {{11456, 11457}}, {{11458, 11459}}, + {{11460, 11461}}, // NOLINT + {{11462, 11463}}, {{11464, 11465}}, {{11466, 11467}}, + {{11468, 11469}}, // NOLINT + {{11470, 11471}}, {{11472, 11473}}, {{11474, 11475}}, + {{11476, 11477}}, // NOLINT + {{11478, 11479}}, 
{{11480, 11481}}, {{11482, 11483}}, + {{11484, 11485}}, // NOLINT + {{11486, 11487}}, {{11488, 11489}}, {{11490, 11491}}, + {{11499, 11500}}, // NOLINT + {{11501, 11502}}, {{11506, 11507}}, {{4256, 11520}}, + {{4293, 11557}}, // NOLINT + {{4295, 11559}}, {{4301, 11565}}, {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262UnCanonicalizeTable1Size = 149; // NOLINT +static const int32_t kEcma262UnCanonicalizeTable1[298] = { + 306, 1, 334, 1, 1073742176, 5, 367, 9, + 1073742192, 5, 383, 9, 387, 13, 388, 13, // NOLINT + 1073743030, 17, 1231, 21, 1073743056, 17, 1257, 21, + 1073744896, 25, 3118, 29, 1073744944, 25, 3166, 29, // NOLINT + 3168, 33, 3169, 33, 3170, 37, 3171, 41, + 3172, 45, 3173, 49, 3174, 53, 3175, 57, // NOLINT + 3176, 57, 3177, 61, 3178, 61, 3179, 65, + 3180, 65, 3181, 69, 3182, 73, 3183, 77, // NOLINT + 3184, 81, 3186, 85, 3187, 85, 3189, 89, + 3190, 89, 1073745022, 93, 3199, 97, 3200, 101, // NOLINT + 3201, 101, 3202, 105, 3203, 105, 3204, 109, + 3205, 109, 3206, 113, 3207, 113, 3208, 117, // NOLINT + 3209, 117, 3210, 121, 3211, 121, 3212, 125, + 3213, 125, 3214, 129, 3215, 129, 3216, 133, // NOLINT + 3217, 133, 3218, 137, 3219, 137, 3220, 141, + 3221, 141, 3222, 145, 3223, 145, 3224, 149, // NOLINT + 3225, 149, 3226, 153, 3227, 153, 3228, 157, + 3229, 157, 3230, 161, 3231, 161, 3232, 165, // NOLINT + 3233, 165, 3234, 169, 3235, 169, 3236, 173, + 3237, 173, 3238, 177, 3239, 177, 3240, 181, // NOLINT + 3241, 181, 3242, 185, 3243, 185, 3244, 189, + 3245, 189, 3246, 193, 3247, 193, 3248, 197, // NOLINT + 3249, 197, 3250, 201, 3251, 201, 3252, 205, + 3253, 205, 3254, 209, 3255, 209, 3256, 213, // NOLINT + 3257, 213, 3258, 217, 3259, 217, 3260, 221, + 3261, 221, 3262, 225, 3263, 225, 3264, 229, // NOLINT + 3265, 229, 3266, 233, 3267, 233, 3268, 237, + 3269, 237, 3270, 241, 3271, 241, 3272, 245, // NOLINT + 3273, 245, 3274, 249, 3275, 249, 3276, 253, + 3277, 253, 3278, 257, 3279, 257, 3280, 261, // NOLINT + 3281, 261, 3282, 265, 3283, 265, 3284, 269, + 
3285, 269, 3286, 273, 3287, 273, 3288, 277, // NOLINT + 3289, 277, 3290, 281, 3291, 281, 3292, 285, + 3293, 285, 3294, 289, 3295, 289, 3296, 293, // NOLINT + 3297, 293, 3298, 297, 3299, 297, 3307, 301, + 3308, 301, 3309, 305, 3310, 305, 3314, 309, // NOLINT + 3315, 309, 1073745152, 313, 3365, 317, 3367, 321, + 3373, 325}; // NOLINT +static const MultiCharacterSpecialCase<2> + kEcma262UnCanonicalizeMultiStrings5[104] = { // NOLINT + {{42560, 42561}}, {{42562, 42563}}, + {{42564, 42565}}, {{42566, 42567}}, // NOLINT + {{42568, 42569}}, {{42570, 42571}}, + {{42572, 42573}}, {{42574, 42575}}, // NOLINT + {{42576, 42577}}, {{42578, 42579}}, + {{42580, 42581}}, {{42582, 42583}}, // NOLINT + {{42584, 42585}}, {{42586, 42587}}, + {{42588, 42589}}, {{42590, 42591}}, // NOLINT + {{42592, 42593}}, {{42594, 42595}}, + {{42596, 42597}}, {{42598, 42599}}, // NOLINT + {{42600, 42601}}, {{42602, 42603}}, + {{42604, 42605}}, {{42624, 42625}}, // NOLINT + {{42626, 42627}}, {{42628, 42629}}, + {{42630, 42631}}, {{42632, 42633}}, // NOLINT + {{42634, 42635}}, {{42636, 42637}}, + {{42638, 42639}}, {{42640, 42641}}, // NOLINT + {{42642, 42643}}, {{42644, 42645}}, + {{42646, 42647}}, {{42648, 42649}}, // NOLINT + {{42650, 42651}}, {{42786, 42787}}, + {{42788, 42789}}, {{42790, 42791}}, // NOLINT + {{42792, 42793}}, {{42794, 42795}}, + {{42796, 42797}}, {{42798, 42799}}, // NOLINT + {{42802, 42803}}, {{42804, 42805}}, + {{42806, 42807}}, {{42808, 42809}}, // NOLINT + {{42810, 42811}}, {{42812, 42813}}, + {{42814, 42815}}, {{42816, 42817}}, // NOLINT + {{42818, 42819}}, {{42820, 42821}}, + {{42822, 42823}}, {{42824, 42825}}, // NOLINT + {{42826, 42827}}, {{42828, 42829}}, + {{42830, 42831}}, {{42832, 42833}}, // NOLINT + {{42834, 42835}}, {{42836, 42837}}, + {{42838, 42839}}, {{42840, 42841}}, // NOLINT + {{42842, 42843}}, {{42844, 42845}}, + {{42846, 42847}}, {{42848, 42849}}, // NOLINT + {{42850, 42851}}, {{42852, 42853}}, + {{42854, 42855}}, {{42856, 42857}}, // NOLINT + {{42858, 
42859}}, {{42860, 42861}}, + {{42862, 42863}}, {{42873, 42874}}, // NOLINT + {{42875, 42876}}, {{7545, 42877}}, + {{42878, 42879}}, {{42880, 42881}}, // NOLINT + {{42882, 42883}}, {{42884, 42885}}, + {{42886, 42887}}, {{42891, 42892}}, // NOLINT + {{613, 42893}}, {{42896, 42897}}, + {{42898, 42899}}, {{42902, 42903}}, // NOLINT + {{42904, 42905}}, {{42906, 42907}}, + {{42908, 42909}}, {{42910, 42911}}, // NOLINT + {{42912, 42913}}, {{42914, 42915}}, + {{42916, 42917}}, {{42918, 42919}}, // NOLINT + {{42920, 42921}}, {{614, 42922}}, + {{604, 42923}}, {{609, 42924}}, // NOLINT + {{620, 42925}}, {{670, 42928}}, + {{647, 42929}}, {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262UnCanonicalizeTable5Size = 198; // NOLINT +static const int32_t + kEcma262UnCanonicalizeTable5[396] = + {1600, 1, 1601, 1, 1602, 5, 1603, 5, + 1604, 9, 1605, 9, 1606, 13, 1607, 13, // NOLINT + 1608, 17, 1609, 17, 1610, 21, 1611, 21, + 1612, 25, 1613, 25, 1614, 29, 1615, 29, // NOLINT + 1616, 33, 1617, 33, 1618, 37, 1619, 37, + 1620, 41, 1621, 41, 1622, 45, 1623, 45, // NOLINT + 1624, 49, 1625, 49, 1626, 53, 1627, 53, + 1628, 57, 1629, 57, 1630, 61, 1631, 61, // NOLINT + 1632, 65, 1633, 65, 1634, 69, 1635, 69, + 1636, 73, 1637, 73, 1638, 77, 1639, 77, // NOLINT + 1640, 81, 1641, 81, 1642, 85, 1643, 85, + 1644, 89, 1645, 89, 1664, 93, 1665, 93, // NOLINT + 1666, 97, 1667, 97, 1668, 101, 1669, 101, + 1670, 105, 1671, 105, 1672, 109, 1673, 109, // NOLINT + 1674, 113, 1675, 113, 1676, 117, 1677, 117, + 1678, 121, 1679, 121, 1680, 125, 1681, 125, // NOLINT + 1682, 129, 1683, 129, 1684, 133, 1685, 133, + 1686, 137, 1687, 137, 1688, 141, 1689, 141, // NOLINT + 1690, 145, 1691, 145, 1826, 149, 1827, 149, + 1828, 153, 1829, 153, 1830, 157, 1831, 157, // NOLINT + 1832, 161, 1833, 161, 1834, 165, 1835, 165, + 1836, 169, 1837, 169, 1838, 173, 1839, 173, // NOLINT + 1842, 177, 1843, 177, 1844, 181, 1845, 181, + 1846, 185, 1847, 185, 1848, 189, 1849, 189, // NOLINT + 1850, 193, 1851, 193, 1852, 197, 
1853, 197, + 1854, 201, 1855, 201, 1856, 205, 1857, 205, // NOLINT + 1858, 209, 1859, 209, 1860, 213, 1861, 213, + 1862, 217, 1863, 217, 1864, 221, 1865, 221, // NOLINT + 1866, 225, 1867, 225, 1868, 229, 1869, 229, + 1870, 233, 1871, 233, 1872, 237, 1873, 237, // NOLINT + 1874, 241, 1875, 241, 1876, 245, 1877, 245, + 1878, 249, 1879, 249, 1880, 253, 1881, 253, // NOLINT + 1882, 257, 1883, 257, 1884, 261, 1885, 261, + 1886, 265, 1887, 265, 1888, 269, 1889, 269, // NOLINT + 1890, 273, 1891, 273, 1892, 277, 1893, 277, + 1894, 281, 1895, 281, 1896, 285, 1897, 285, // NOLINT + 1898, 289, 1899, 289, 1900, 293, 1901, 293, + 1902, 297, 1903, 297, 1913, 301, 1914, 301, // NOLINT + 1915, 305, 1916, 305, 1917, 309, 1918, 313, + 1919, 313, 1920, 317, 1921, 317, 1922, 321, // NOLINT + 1923, 321, 1924, 325, 1925, 325, 1926, 329, + 1927, 329, 1931, 333, 1932, 333, 1933, 337, // NOLINT + 1936, 341, 1937, 341, 1938, 345, 1939, 345, + 1942, 349, 1943, 349, 1944, 353, 1945, 353, // NOLINT + 1946, 357, 1947, 357, 1948, 361, 1949, 361, + 1950, 365, 1951, 365, 1952, 369, 1953, 369, // NOLINT + 1954, 373, 1955, 373, 1956, 377, 1957, 377, + 1958, 381, 1959, 381, 1960, 385, 1961, 385, // NOLINT + 1962, 389, 1963, 393, 1964, 397, 1965, 401, + 1968, 405, 1969, 409}; // NOLINT +static const MultiCharacterSpecialCase<2> + kEcma262UnCanonicalizeMultiStrings7[3] = { // NOLINT + {{65313, 65345}}, + {{65338, 65370}}, + {{kSentinel}}}; // NOLINT +static const uint16_t kEcma262UnCanonicalizeTable7Size = 4; // NOLINT +static const int32_t kEcma262UnCanonicalizeTable7[8] = { + 1073749793, 1, 7994, 5, 1073749825, 1, 8026, 5}; // NOLINT +int Ecma262UnCanonicalize::Convert(uchar c, uchar n, uchar* result, + bool* allow_caching_ptr) { + int chunk_index = c >> 13; + switch (chunk_index) { + case 0: + return LookupMapping<true>( + kEcma262UnCanonicalizeTable0, kEcma262UnCanonicalizeTable0Size, + kEcma262UnCanonicalizeMultiStrings0, c, n, result, allow_caching_ptr); + case 1: + return LookupMapping<true>( + 
kEcma262UnCanonicalizeTable1, kEcma262UnCanonicalizeTable1Size, + kEcma262UnCanonicalizeMultiStrings1, c, n, result, allow_caching_ptr); + case 5: + return LookupMapping<true>( + kEcma262UnCanonicalizeTable5, kEcma262UnCanonicalizeTable5Size, + kEcma262UnCanonicalizeMultiStrings5, c, n, result, allow_caching_ptr); + case 7: + return LookupMapping<true>( + kEcma262UnCanonicalizeTable7, kEcma262UnCanonicalizeTable7Size, + kEcma262UnCanonicalizeMultiStrings7, c, n, result, allow_caching_ptr); + default: + return 0; + } +} + +static const MultiCharacterSpecialCase<1> + kCanonicalizationRangeMultiStrings0[1] = { // NOLINT + {{kSentinel}}}; // NOLINT +static const uint16_t kCanonicalizationRangeTable0Size = 70; // NOLINT +static const int32_t kCanonicalizationRangeTable0[140] = { + 1073741889, 100, 90, 0, 1073741921, 100, 122, 0, + 1073742016, 88, 214, 0, 1073742040, 24, 222, 0, // NOLINT + 1073742048, 88, 246, 0, 1073742072, 24, 254, 0, + 1073742715, 8, 893, 0, 1073742728, 8, 906, 0, // NOLINT + 1073742749, 8, 927, 0, 1073742759, 16, 939, 0, + 1073742765, 8, 943, 0, 1073742781, 8, 959, 0, // NOLINT + 1073742791, 16, 971, 0, 1073742845, 8, 1023, 0, + 1073742848, 60, 1039, 0, 1073742864, 124, 1071, 0, // NOLINT + 1073742896, 124, 1103, 0, 1073742928, 60, 1119, 0, + 1073743153, 148, 1366, 0, 1073743201, 148, 1414, 0, // NOLINT + 1073746080, 148, 4293, 0, 1073749760, 28, 7943, 0, + 1073749768, 28, 7951, 0, 1073749776, 20, 7957, 0, // NOLINT + 1073749784, 20, 7965, 0, 1073749792, 28, 7975, 0, + 1073749800, 28, 7983, 0, 1073749808, 28, 7991, 0, // NOLINT + 1073749816, 28, 7999, 0, 1073749824, 20, 8005, 0, + 1073749832, 20, 8013, 0, 1073749856, 28, 8039, 0, // NOLINT + 1073749864, 28, 8047, 0, 1073749874, 12, 8053, 0, + 1073749960, 12, 8139, 0}; // NOLINT +static const MultiCharacterSpecialCase<1> + kCanonicalizationRangeMultiStrings1[1] = { // NOLINT + {{kSentinel}}}; // NOLINT +static const uint16_t kCanonicalizationRangeTable1Size = 14; // NOLINT +static const int32_t 
kCanonicalizationRangeTable1[28] = { + 1073742176, 60, 367, 0, 1073742192, 60, 383, 0, + 1073743030, 100, 1231, 0, 1073743056, 100, 1257, 0, // NOLINT + 1073744896, 184, 3118, 0, 1073744944, 184, 3166, 0, + 1073745152, 148, 3365, 0}; // NOLINT +static const MultiCharacterSpecialCase<1> + kCanonicalizationRangeMultiStrings7[1] = { // NOLINT + {{kSentinel}}}; // NOLINT +static const uint16_t kCanonicalizationRangeTable7Size = 4; // NOLINT +static const int32_t kCanonicalizationRangeTable7[8] = { + 1073749793, 100, 7994, 0, 1073749825, 100, 8026, 0}; // NOLINT +int CanonicalizationRange::Convert(uchar c, uchar n, uchar* result, + bool* allow_caching_ptr) { + int chunk_index = c >> 13; + switch (chunk_index) { + case 0: + return LookupMapping<false>( + kCanonicalizationRangeTable0, kCanonicalizationRangeTable0Size, + kCanonicalizationRangeMultiStrings0, c, n, result, allow_caching_ptr); + case 1: + return LookupMapping<false>( + kCanonicalizationRangeTable1, kCanonicalizationRangeTable1Size, + kCanonicalizationRangeMultiStrings1, c, n, result, allow_caching_ptr); + case 7: + return LookupMapping<false>( + kCanonicalizationRangeTable7, kCanonicalizationRangeTable7Size, + kCanonicalizationRangeMultiStrings7, c, n, result, allow_caching_ptr); + default: + return 0; + } +} + +#endif // !V8_INTL_SUPPORT + +} // namespace unibrow +} // namespace v8 diff --git a/js/src/irregexp/util/VectorShim.h b/js/src/irregexp/util/VectorShim.h new file mode 100644 index 0000000000..1b8882f234 --- /dev/null +++ b/js/src/irregexp/util/VectorShim.h @@ -0,0 +1,231 @@ +// Copyright 2014 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef V8_UTIL_VECTOR_H_ +#define V8_UTIL_VECTOR_H_ + +#include <algorithm> +#include <cstring> +#include <iterator> +#include <memory> + +#include "js/AllocPolicy.h" +#include "js/Utility.h" +#include "js/Vector.h" + +namespace v8 { +namespace internal { + +////////////////////////////////////////////////// + +// Adapted from: +// https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/allocation.h#L36-L58 + +template <typename T> +T* NewArray(size_t size) { + static_assert(std::is_pod<T>::value, ""); + js::AutoEnterOOMUnsafeRegion oomUnsafe; + T* result = static_cast<T*>(js_malloc(size * sizeof(T))); + if (!result) { + oomUnsafe.crash("Irregexp NewArray"); + } + return result; +} + +template <typename T> +void DeleteArray(T* array) { + js_free(array); +} + +} // namespace internal + +namespace base { + +////////////////////////////////////////////////// + +// A non-resizable vector containing a pointer and a length. +// The Vector may or may not own the pointer, depending on context. +// Origin: +// https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/vector.h#L20-L134 + +template <typename T> +class Vector { + public: + constexpr Vector() : start_(nullptr), length_(0) {} + + constexpr Vector(T* data, size_t length) : start_(data), length_(length) { + MOZ_ASSERT_IF(length != 0, data != nullptr); + } + + static Vector<T> New(size_t length) { + return Vector<T>(v8::internal::NewArray<T>(length), length); + } + + // Returns a vector using the same backing storage as this one, + // spanning from and including 'from', to but not including 'to'. + Vector<T> SubVector(size_t from, size_t to) const { + MOZ_ASSERT(from <= to); + MOZ_ASSERT(to <= length_); + return Vector<T>(begin() + from, to - from); + } + + // Returns the length of the vector. Only use this if you really need an + // integer return value. Use {size()} otherwise. 
+ int length() const { + MOZ_ASSERT(length_ <= static_cast<size_t>(std::numeric_limits<int>::max())); + return static_cast<int>(length_); + } + + // Returns the length of the vector as a size_t. + constexpr size_t size() const { return length_; } + + // Returns whether or not the vector is empty. + constexpr bool empty() const { return length_ == 0; } + + // Access individual vector elements - checks bounds in debug mode. + T& operator[](size_t index) const { + MOZ_ASSERT(index < length_); + return start_[index]; + } + + const T& at(size_t index) const { return operator[](index); } + + T& first() { return start_[0]; } + + T& last() { + MOZ_ASSERT(length_ > 0); + return start_[length_ - 1]; + } + + // Returns a pointer to the start of the data in the vector. + constexpr T* begin() const { return start_; } + + // Returns a pointer past the end of the data in the vector. + constexpr T* end() const { return start_ + length_; } + + // Returns a clone of this vector with a new backing store. + Vector<T> Clone() const { + T* result = v8::internal::NewArray<T>(length_); + for (size_t i = 0; i < length_; i++) result[i] = start_[i]; + return Vector<T>(result, length_); + } + + void Truncate(size_t length) { + MOZ_ASSERT(length <= length_); + length_ = length; + } + + // Releases the array underlying this vector. Once disposed the + // vector is empty. + void Dispose() { + DeleteArray(start_); + start_ = nullptr; + length_ = 0; + } + + Vector<T> operator+(size_t offset) const { + MOZ_ASSERT(offset <= length_); + return Vector<T>(start_ + offset, length_ - offset); + } + + Vector<T> operator+=(size_t offset) { + MOZ_ASSERT(offset <= length_); + start_ += offset; + length_ -= offset; + return *this; + } + + // Implicit conversion from Vector<T> to Vector<const T>. 
+ inline operator Vector<const T>() const { + return Vector<const T>::cast(*this); + } + + template <typename S> + static constexpr Vector<T> cast(Vector<S> input) { + return Vector<T>(reinterpret_cast<T*>(input.begin()), + input.length() * sizeof(S) / sizeof(T)); + } + + bool operator==(const Vector<const T> other) const { + if (length_ != other.length_) return false; + if (start_ == other.start_) return true; + for (size_t i = 0; i < length_; ++i) { + if (start_[i] != other.start_[i]) { + return false; + } + } + return true; + } + + private: + T* start_; + size_t length_; +}; + +// The resulting vector does not contain a null-termination byte. If you want +// the null byte, use ArrayVector("foo"). +inline Vector<const char> CStrVector(const char* data) { + return Vector<const char>(data, strlen(data)); +} + +// Construct a Vector from a start pointer and a size. +template <typename T> +inline constexpr Vector<T> VectorOf(T* start, size_t size) { + return {start, size}; +} + +class DefaultAllocator { + public: + using Policy = js::SystemAllocPolicy; + Policy policy() const { return js::SystemAllocPolicy(); } +}; + +// SmallVector uses inline storage first, and reallocates when full. +// It is basically equivalent to js::Vector, and is implemented +// as a thin wrapper. +// V8's implementation: +// https://github.com/v8/v8/blob/main/src/base/small-vector.h +template <typename T, size_t kSize, typename Allocator = DefaultAllocator> +class SmallVector { + public: + explicit SmallVector(const Allocator& allocator = DefaultAllocator()) + : inner_(allocator.policy()) {} + SmallVector(size_t size) { resize_no_init(size); } + + inline bool empty() const { return inner_.empty(); } + inline const T& back() const { return inner_.back(); } + inline void pop_back() { inner_.popBack(); }; + template <typename... Args> + inline void emplace_back(Args&&... 
args) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + if (!inner_.emplaceBack(args...)) { + oomUnsafe.crash("Irregexp SmallVector emplace_back"); + } + }; + inline size_t size() const { return inner_.length(); } + inline const T& at(size_t index) const { return inner_[index]; } + T* data() { return inner_.begin(); } + T* begin() { return inner_.begin(); } + + T& operator[](size_t index) { return inner_[index]; } + const T& operator[](size_t index) const { return inner_[index]; } + + inline void clear() { inner_.clear(); } + + void resize_no_init(size_t new_size) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + if (!inner_.resizeUninitialized(new_size)) { + oomUnsafe.crash("Irregexp SmallVector resize"); + } + } + + private: + js::Vector<T, kSize, typename Allocator::Policy> inner_; +}; + +} // namespace base + +} // namespace v8 + +#endif // V8_UTIL_VECTOR_H_ diff --git a/js/src/irregexp/util/ZoneShim.h b/js/src/irregexp/util/ZoneShim.h new file mode 100644 index 0000000000..7cb1ea650d --- /dev/null +++ b/js/src/irregexp/util/ZoneShim.h @@ -0,0 +1,403 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_UTIL_ZONE_H_ +#define V8_UTIL_ZONE_H_ + +#include <list> +#include <map> +#include <set> +#include <unordered_map> +#include <vector> + +#include "ds/LifoAlloc.h" +#include "ds/Sort.h" +#include "irregexp/util/VectorShim.h" + +namespace v8 { +namespace internal { + +// V8::Zone ~= LifoAlloc +class Zone { + public: + Zone(js::LifoAlloc& alloc) : lifoAlloc_(alloc) {} + + template <typename T, typename... Args> + T* New(Args&&... 
args) { + js::LifoAlloc::AutoFallibleScope fallible(&lifoAlloc_); + js::AutoEnterOOMUnsafeRegion oomUnsafe; + void* memory = lifoAlloc_.alloc(sizeof(T)); + if (!memory) { + oomUnsafe.crash("Irregexp Zone::New"); + } + return new (memory) T(std::forward<Args>(args)...); + } + + // Allocates uninitialized memory for 'length' number of T instances. + template <typename T> + T* NewArray(size_t length) { + js::LifoAlloc::AutoFallibleScope fallible(&lifoAlloc_); + js::AutoEnterOOMUnsafeRegion oomUnsafe; + void* memory = lifoAlloc_.alloc(length * sizeof(T)); + if (!memory) { + oomUnsafe.crash("Irregexp Zone::New"); + } + return static_cast<T*>(memory); + } + + void DeleteAll() { lifoAlloc_.freeAll(); } + + // Returns true if the total memory allocated exceeds a threshold. + static const size_t kExcessLimit = 256 * 1024 * 1024; + bool excess_allocation() const { + return lifoAlloc_.computedSizeOfExcludingThis() > kExcessLimit; + } + + js::LifoAlloc& inner() { return lifoAlloc_; } + + private: + js::LifoAlloc& lifoAlloc_; +}; + +// Superclass for classes allocated in a Zone. +// Based on: https://github.com/v8/v8/blob/master/src/zone/zone.h +class ZoneObject { + public: + // new (zone) SomeObject(...) was the old pattern. + // Delete the constructor to avoid using it accidentally. + void* operator new(size_t size, Zone* zone) = delete; + + // Allow non-allocating placement new + void* operator new(size_t size, void* ptr) { return ptr; } + + // Ideally, the delete operator should be private instead of + // public, but unfortunately the compiler sometimes synthesizes + // (unused) destructors for classes derived from ZoneObject, which + // require the operator to be visible. MSVC requires the delete + // operator to be public. + + // ZoneObjects should never be deleted individually; use + // Zone::DeleteAll() to delete all zone objects in one go. 
+ void operator delete(void*, size_t) { MOZ_CRASH("unreachable"); } + void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); } +}; + +// ZoneLists are growable lists with constant-time access to the +// elements. The list itself and all its elements are allocated in the +// Zone. ZoneLists cannot be deleted individually; you can delete all +// objects in the Zone by calling Zone::DeleteAll(). +// Used throughout irregexp. +// Based on: https://github.com/v8/v8/blob/master/src/zone/zone-list.h +template <typename T> +class ZoneList final : public ZoneObject { + public: + // Construct a new ZoneList with the given capacity; the length is + // always zero. The capacity must be non-negative. + ZoneList(int capacity, Zone* zone) : capacity_(capacity) { + data_ = (capacity_ > 0) ? zone->NewArray<T>(capacity_) : nullptr; + } + // Construct a new ZoneList by copying the elements of the given ZoneList. + ZoneList(const ZoneList<T>& other, Zone* zone) + : ZoneList(other.length(), zone) { + AddAll(other, zone); + } + + // Construct a new ZoneList by copying the elements of the given vector. + ZoneList(const base::Vector<const T>& other, Zone* zone) + : ZoneList(other.length(), zone) { + AddAll(other, zone); + } + + ZoneList(ZoneList<T>&& other) { *this = std::move(other); } + + ZoneList& operator=(ZoneList&& other) { + MOZ_ASSERT(!data_); + data_ = other.data_; + capacity_ = other.capacity_; + length_ = other.length_; + other.Clear(); + return *this; + } + + // Returns a reference to the element at index i. This reference is not safe + // to use after operations that can change the list's backing store + // (e.g. Add). 
+ inline T& operator[](int i) const { + MOZ_ASSERT(i >= 0); + MOZ_ASSERT(static_cast<unsigned>(i) < static_cast<unsigned>(length_)); + return data_[i]; + } + inline T& at(int i) const { return operator[](i); } + inline T& last() const { return at(length_ - 1); } + inline T& first() const { return at(0); } + + using iterator = T*; + inline iterator begin() const { return &data_[0]; } + inline iterator end() const { return &data_[length_]; } + + inline bool is_empty() const { return length_ == 0; } + inline int length() const { return length_; } + inline int capacity() const { return capacity_; } + + base::Vector<T> ToVector() const { return base::Vector<T>(data_, length_); } + base::Vector<T> ToVector(int start, int length) const { + return base::Vector<T>(data_ + start, std::min(length_ - start, length)); + } + + base::Vector<const T> ToConstVector() const { + return base::Vector<const T>(data_, length_); + } + + // Adds a copy of the given 'element' to the end of the list, + // expanding the list if necessary. + void Add(const T& element, Zone* zone) { + if (length_ < capacity_) { + data_[length_++] = element; + } else { + ZoneList<T>::ResizeAdd(element, zone); + } + } + // Add all the elements from the argument list to this list. + void AddAll(const ZoneList<T>& other, Zone* zone) { + AddAll(other.ToVector(), zone); + } + // Add all the elements from the vector to this list. + void AddAll(const base::Vector<const T>& other, Zone* zone) { + int result_length = length_ + other.length(); + if (capacity_ < result_length) { + Resize(result_length, zone); + } + if (std::is_fundamental<T>()) { + memcpy(data_ + length_, other.begin(), sizeof(*data_) * other.length()); + } else { + for (int i = 0; i < other.length(); i++) { + data_[length_ + i] = other.at(i); + } + } + length_ = result_length; + } + + // Overwrites the element at the specific index. 
+ void Set(int index, const T& element) { + MOZ_ASSERT(index >= 0 && index <= length_); + data_[index] = element; + } + + // Removes the i'th element without deleting it even if T is a + // pointer type; moves all elements above i "down". Returns the + // removed element. This function's complexity is linear in the + // size of the list. + T Remove(int i) { + T element = at(i); + length_--; + while (i < length_) { + data_[i] = data_[i + 1]; + i++; + } + return element; + } + + // Removes the last element without deleting it even if T is a + // pointer type. Returns the removed element. + inline T RemoveLast() { return Remove(length_ - 1); } + + // Clears the list, setting the capacity and length to 0. + inline void Clear() { + data_ = nullptr; + capacity_ = 0; + length_ = 0; + } + + // Drops all but the first 'pos' elements from the list. + inline void Rewind(int pos) { + MOZ_ASSERT(0 <= pos && pos <= length_); + length_ = pos; + } + + inline bool Contains(const T& elm) const { + for (int i = 0; i < length_; i++) { + if (data_[i] == elm) return true; + } + return false; + } + + template <typename CompareFunction> + void StableSort(CompareFunction cmp, size_t start, size_t length) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + T* scratch = static_cast<T*>(js_malloc(length * sizeof(T))); + if (!scratch) { + oomUnsafe.crash("Irregexp stable sort scratch space"); + } + auto comparator = [cmp](const T& a, const T& b, bool* lessOrEqual) { + *lessOrEqual = cmp(&a, &b) <= 0; + return true; + }; + MOZ_ALWAYS_TRUE( + js::MergeSort(begin() + start, length, scratch, comparator)); + js_free(scratch); + } + + void operator delete(void* pointer) { MOZ_CRASH("unreachable"); } + void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); } + + private: + T* data_ = nullptr; + int capacity_ = 0; + int length_ = 0; + + // Increase the capacity of a full list, and add an element. + // List must be full already. 
+ void ResizeAdd(const T& element, Zone* zone) { + MOZ_ASSERT(length_ >= capacity_); + // Grow the list capacity by 100%, but make sure to let it grow + // even when the capacity is zero (possible initial case). + int new_capacity = 1 + 2 * capacity_; + // Since the element reference could be an element of the list, copy + // it out of the old backing storage before resizing. + T temp = element; + Resize(new_capacity, zone); + data_[length_++] = temp; + } + + // Resize the list. + void Resize(int new_capacity, Zone* zone) { + MOZ_ASSERT(length_ <= new_capacity); + static_assert(std::is_trivially_copyable<T>::value); + T* new_data = zone->NewArray<T>(new_capacity); + if (length_ > 0) { + memcpy(new_data, data_, length_ * sizeof(T)); + } + data_ = new_data; + capacity_ = new_capacity; + } + + ZoneList& operator=(const ZoneList&) = delete; + ZoneList() = delete; + ZoneList(const ZoneList&) = delete; +}; + +// Based on: https://github.com/v8/v8/blob/master/src/zone/zone-allocator.h +template <typename T> +class ZoneAllocator { + public: + using pointer = T*; + using const_pointer = const T*; + using reference = T&; + using const_reference = const T&; + using value_type = T; + using size_type = size_t; + using difference_type = ptrdiff_t; + template <class O> + struct rebind { + using other = ZoneAllocator<O>; + }; + + explicit ZoneAllocator(Zone* zone) : zone_(zone) {} + template <typename U> + ZoneAllocator(const ZoneAllocator<U>& other) + : ZoneAllocator<T>(other.zone_) {} + template <typename U> + friend class ZoneAllocator; + + T* allocate(size_t n) { return zone_->NewArray<T>(n); } + void deallocate(T* p, size_t) {} // noop for zones + + bool operator==(ZoneAllocator const& other) const { + return zone_ == other.zone_; + } + bool operator!=(ZoneAllocator const& other) const { + return zone_ != other.zone_; + } + + using Policy = js::LifoAllocPolicy<js::Fallible>; + Policy policy() const { + return js::LifoAllocPolicy<js::Fallible>(zone_->inner()); + } + + private: 
+ Zone* zone_; +}; + +// Zone wrappers for std containers: +// Origin: +// https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-containers.h#L25-L169 + +// A wrapper subclass for std::vector to make it easy to construct one +// that uses a zone allocator. +// Used throughout irregexp +template <typename T> +class ZoneVector : public std::vector<T, ZoneAllocator<T>> { + public: + ZoneVector(Zone* zone) + : std::vector<T, ZoneAllocator<T>>(ZoneAllocator<T>(zone)) {} + + // Constructs a new vector and fills it with {size} elements, each + // constructed via the default constructor. + ZoneVector(size_t size, Zone* zone) + : std::vector<T, ZoneAllocator<T>>(size, T(), ZoneAllocator<T>(zone)) {} + + // Constructs a new vector and fills it with the contents of the range + // [first, last). + template <class Iter> + ZoneVector(Iter first, Iter last, Zone* zone) + : std::vector<T, ZoneAllocator<T>>(first, last, ZoneAllocator<T>(zone)) {} +}; + +// A wrapper subclass for std::list to make it easy to construct one +// that uses a zone allocator. +// Used in regexp-bytecode-peephole.cc +template <typename T> +class ZoneLinkedList : public std::list<T, ZoneAllocator<T>> { + public: + // Constructs an empty list. + explicit ZoneLinkedList(Zone* zone) + : std::list<T, ZoneAllocator<T>>(ZoneAllocator<T>(zone)) {} +}; + +// A wrapper subclass for std::set to make it easy to construct one that uses +// a zone allocator. +// Used in regexp-parser.cc +template <typename K, typename Compare = std::less<K>> +class ZoneSet : public std::set<K, Compare, ZoneAllocator<K>> { + public: + // Constructs an empty set. + explicit ZoneSet(Zone* zone) + : std::set<K, Compare, ZoneAllocator<K>>(Compare(), + ZoneAllocator<K>(zone)) {} +}; + +// A wrapper subclass for std::map to make it easy to construct one that uses +// a zone allocator. 
+// Used in regexp-bytecode-peephole.cc +template <typename K, typename V, typename Compare = std::less<K>> +class ZoneMap + : public std::map<K, V, Compare, ZoneAllocator<std::pair<const K, V>>> { + public: + // Constructs an empty map. + explicit ZoneMap(Zone* zone) + : std::map<K, V, Compare, ZoneAllocator<std::pair<const K, V>>>( + Compare(), ZoneAllocator<std::pair<const K, V>>(zone)) {} +}; + +// A wrapper subclass for std::unordered_map to make it easy to construct one +// that uses a zone allocator. +// Used in regexp-bytecode-peephole.cc +template <typename K, typename V, typename Hash = std::hash<K>, + typename KeyEqual = std::equal_to<K>> +class ZoneUnorderedMap + : public std::unordered_map<K, V, Hash, KeyEqual, + ZoneAllocator<std::pair<const K, V>>> { + public: + // Constructs an empty map. + explicit ZoneUnorderedMap(Zone* zone, size_t bucket_count = 100) + : std::unordered_map<K, V, Hash, KeyEqual, + ZoneAllocator<std::pair<const K, V>>>( + bucket_count, Hash(), KeyEqual(), + ZoneAllocator<std::pair<const K, V>>(zone)) {} +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_UTIL_FLAG_H_ |