diff options
Diffstat (limited to '')
-rw-r--r-- | js/src/irregexp/RegExpAPI.cpp | 916 |
1 files changed, 916 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpAPI.cpp b/js/src/irregexp/RegExpAPI.cpp new file mode 100644 index 0000000000..a008d454f1 --- /dev/null +++ b/js/src/irregexp/RegExpAPI.cpp @@ -0,0 +1,916 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "irregexp/RegExpAPI.h" + +#include "mozilla/ArrayUtils.h" +#include "mozilla/Casting.h" + +#include "frontend/FrontendContext.h" // AutoReportFrontendContext +#include "frontend/TokenStream.h" +#include "gc/GC.h" +#include "gc/Zone.h" +#include "irregexp/imported/regexp-ast.h" +#include "irregexp/imported/regexp-bytecode-generator.h" +#include "irregexp/imported/regexp-compiler.h" +#include "irregexp/imported/regexp-interpreter.h" +#include "irregexp/imported/regexp-macro-assembler-arch.h" +#include "irregexp/imported/regexp-macro-assembler-tracer.h" +#include "irregexp/imported/regexp-parser.h" +#include "irregexp/imported/regexp-stack.h" +#include "irregexp/imported/regexp.h" +#include "irregexp/RegExpNativeMacroAssembler.h" +#include "irregexp/RegExpShim.h" +#include "jit/JitCommon.h" +#include "js/friend/ErrorMessages.h" // JSMSG_* +#include "js/friend/StackLimits.h" // js::ReportOverRecursed +#include "util/StringBuffer.h" +#include "vm/MatchPairs.h" +#include "vm/PlainObject.h" +#include "vm/RegExpShared.h" + +namespace js { +namespace irregexp { + +using mozilla::AssertedCast; +using mozilla::Maybe; +using mozilla::Nothing; +using mozilla::PointerRangeSize; +using mozilla::Some; + +using frontend::DummyTokenStream; +using frontend::TokenStreamAnyChars; + +using v8::internal::DisallowGarbageCollection; +using v8::internal::HandleScope; +using v8::internal::InputOutputData; +using v8::internal::IrregexpInterpreter; +using v8::internal::NativeRegExpMacroAssembler; +using v8::internal::RegExpBytecodeGenerator; +using v8::internal::RegExpCapture; +using v8::internal::RegExpCompileData; +using v8::internal::RegExpCompiler; +using v8::internal::RegExpError; +using v8::internal::RegExpMacroAssembler; +using v8::internal::RegExpMacroAssemblerTracer; +using v8::internal::RegExpNode; +using v8::internal::RegExpParser; +using v8::internal::SMRegExpMacroAssembler; +using v8::internal::Zone; +using v8::internal::ZoneVector; + +using V8HandleString = v8::internal::Handle<v8::internal::String>; +using V8HandleRegExp = v8::internal::Handle<v8::internal::JSRegExp>; + +using namespace v8::internal::regexp_compiler_constants; + +static uint32_t ErrorNumber(RegExpError err) { + switch (err) { + case RegExpError::kNone: + return JSMSG_NOT_AN_ERROR; + case RegExpError::kStackOverflow: + return JSMSG_OVER_RECURSED; + case RegExpError::kAnalysisStackOverflow: + return JSMSG_OVER_RECURSED; + case RegExpError::kTooLarge: + return JSMSG_TOO_MANY_PARENS; + case RegExpError::kUnterminatedGroup: + return JSMSG_MISSING_PAREN; + case RegExpError::kUnmatchedParen: + return JSMSG_UNMATCHED_RIGHT_PAREN; + case RegExpError::kEscapeAtEndOfPattern: + return JSMSG_ESCAPE_AT_END_OF_REGEXP; + case RegExpError::kInvalidPropertyName: + return JSMSG_INVALID_PROPERTY_NAME; + case RegExpError::kInvalidEscape: + return JSMSG_INVALID_IDENTITY_ESCAPE; + case RegExpError::kInvalidDecimalEscape: + return JSMSG_INVALID_DECIMAL_ESCAPE; + case RegExpError::kInvalidUnicodeEscape: + return JSMSG_INVALID_UNICODE_ESCAPE; + case RegExpError::kNothingToRepeat: + return JSMSG_NOTHING_TO_REPEAT; + case RegExpError::kLoneQuantifierBrackets: + // Note: V8 reports the same error for both ']' and '}'. + return JSMSG_RAW_BRACKET_IN_REGEXP; + case RegExpError::kRangeOutOfOrder: + return JSMSG_NUMBERS_OUT_OF_ORDER; + case RegExpError::kIncompleteQuantifier: + return JSMSG_INCOMPLETE_QUANTIFIER; + case RegExpError::kInvalidQuantifier: + return JSMSG_INVALID_QUANTIFIER; + case RegExpError::kInvalidGroup: + return JSMSG_INVALID_GROUP; + case RegExpError::kMultipleFlagDashes: + case RegExpError::kRepeatedFlag: + case RegExpError::kInvalidFlagGroup: + // V8 contains experimental support for turning regexp flags on + // and off in the middle of a regular expression. Unless it + // becomes standardized, SM does not support this feature. + MOZ_CRASH("Mode modifiers not supported"); + case RegExpError::kNotLinear: + // V8 has an experimental non-backtracking engine. We do not + // support it yet. + MOZ_CRASH("Non-backtracking execution not supported"); + case RegExpError::kTooManyCaptures: + return JSMSG_TOO_MANY_PARENS; + case RegExpError::kInvalidCaptureGroupName: + return JSMSG_INVALID_CAPTURE_NAME; + case RegExpError::kDuplicateCaptureGroupName: + return JSMSG_DUPLICATE_CAPTURE_NAME; + case RegExpError::kInvalidNamedReference: + return JSMSG_INVALID_NAMED_REF; + case RegExpError::kInvalidNamedCaptureReference: + return JSMSG_INVALID_NAMED_CAPTURE_REF; + case RegExpError::kInvalidClassEscape: + return JSMSG_RANGE_WITH_CLASS_ESCAPE; + case RegExpError::kInvalidClassPropertyName: + return JSMSG_INVALID_CLASS_PROPERTY_NAME; + case RegExpError::kInvalidCharacterClass: + return JSMSG_RANGE_WITH_CLASS_ESCAPE; + case RegExpError::kUnterminatedCharacterClass: + return JSMSG_UNTERM_CLASS; + case RegExpError::kOutOfOrderCharacterClass: + return JSMSG_BAD_CLASS_RANGE; + + case RegExpError::kInvalidClassSetOperation: + case RegExpError::kInvalidCharacterInClass: + case RegExpError::kNegatedCharacterClassWithStrings: + // TODO: implement support for /v flag (bug 1713657) + MOZ_CRASH("Unicode sets not supported"); + + case RegExpError::NumErrors: + MOZ_CRASH("Unreachable"); + } + MOZ_CRASH("Unreachable"); +} + +Isolate* CreateIsolate(JSContext* cx) { + auto isolate = MakeUnique<Isolate>(cx); + if (!isolate || !isolate->init()) { + return nullptr; + } + return isolate.release(); +} + +void TraceIsolate(JSTracer* trc, Isolate* isolate) { isolate->trace(trc); } + +void DestroyIsolate(Isolate* isolate) { + MOZ_ASSERT(isolate->liveHandles() == 0); + MOZ_ASSERT(isolate->livePseudoHandles() == 0); + js_delete(isolate); +} + +size_t IsolateSizeOfIncludingThis(Isolate* isolate, + mozilla::MallocSizeOf mallocSizeOf) { + return isolate->sizeOfIncludingThis(mallocSizeOf); +} + +static size_t ComputeColumn(const Latin1Char* begin, const Latin1Char* end) { + return PointerRangeSize(begin, end); +} + +static size_t ComputeColumn(const char16_t* begin, const char16_t* end) { + return unicode::CountCodePoints(begin, end); +} + +// This function is varargs purely so it can call ReportCompileErrorLatin1. +// We never call it with additional arguments. +template <typename CharT> +static void ReportSyntaxError(TokenStreamAnyChars& ts, + mozilla::Maybe<uint32_t> line, + mozilla::Maybe<uint32_t> column, + RegExpCompileData& result, CharT* start, + size_t length, ...) { + MOZ_ASSERT(line.isSome() == column.isSome()); + + Maybe<gc::AutoSuppressGC> suppressGC; + if (JSContext* maybeCx = ts.context()->maybeCurrentJSContext()) { + suppressGC.emplace(maybeCx); + } + uint32_t errorNumber = ErrorNumber(result.error); + + if (errorNumber == JSMSG_OVER_RECURSED) { + ReportOverRecursed(ts.context()); + return; + } + + uint32_t offset = std::max(result.error_pos, 0); + MOZ_ASSERT(offset <= length); + + ErrorMetadata err; + + // Ordinarily this indicates whether line-of-context information can be + // added, but we entirely ignore that here because we create a + // a line of context based on the expression source. + uint32_t location = ts.currentToken().pos.begin; + if (ts.fillExceptingContext(&err, location)) { + uint32_t columnNumber = + AssertedCast<uint32_t>(ComputeColumn(start, start + offset)); + if (line.isSome()) { + // If this pattern is being checked by the frontend Parser instead + // of other API entry points like |new RegExp|, then the parser will + // have provided both a line and column pointing at the *beginning* + // of the RegExp literal inside the source text. + // We adjust the columnNumber to point to the actual syntax error + // inside the literal. + err.lineNumber = *line; + err.columnNumber = *column + columnNumber; + } else { + // Line breaks are not significant in pattern text in the same way as + // in source text, so act as though pattern text is a single line, then + // compute a column based on "code point" count (treating a lone + // surrogate as a "code point" in UTF-16). Gak. + err.lineNumber = 1; + err.columnNumber = columnNumber; + } + } + + // For most error reporting, the line of context derives from the token + // stream. So when location information doesn't come from the token + // stream, we can't give a line of context. But here the "line of context" + // can be (and is) derived from the pattern text, so we can provide it no + // matter if the location is derived from the caller. + + const CharT* windowStart = + (offset > ErrorMetadata::lineOfContextRadius) + ? start + (offset - ErrorMetadata::lineOfContextRadius) + : start; + + const CharT* windowEnd = + (length - offset > ErrorMetadata::lineOfContextRadius) + ? start + offset + ErrorMetadata::lineOfContextRadius + : start + length; + + size_t windowLength = PointerRangeSize(windowStart, windowEnd); + MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2); + + // Create the windowed string, not including the potential line + // terminator. + StringBuffer windowBuf(ts.context()); + if (!windowBuf.append(windowStart, windowEnd)) { + return; + } + + // The line of context must be null-terminated, and StringBuffer doesn't + // make that happen unless we force it to. + if (!windowBuf.append('\0')) { + return; + } + + err.lineOfContext.reset(windowBuf.stealChars()); + if (!err.lineOfContext) { + return; + } + + err.lineLength = windowLength; + err.tokenOffset = offset - (windowStart - start); + + va_list args; + va_start(args, length); + ReportCompileErrorLatin1(ts.context(), std::move(err), nullptr, errorNumber, + &args); + va_end(args); +} + +static void ReportSyntaxError(TokenStreamAnyChars& ts, + RegExpCompileData& result, + Handle<JSAtom*> pattern) { + JS::AutoCheckCannotGC nogc_; + if (pattern->hasLatin1Chars()) { + ReportSyntaxError(ts, Nothing(), Nothing(), result, + pattern->latin1Chars(nogc_), pattern->length()); + } else { + ReportSyntaxError(ts, Nothing(), Nothing(), result, + pattern->twoByteChars(nogc_), pattern->length()); + } +} + +template <typename CharT> +static bool CheckPatternSyntaxImpl(js::LifoAlloc& alloc, + JS::NativeStackLimit stackLimit, + const CharT* input, uint32_t inputLength, + JS::RegExpFlags flags, + RegExpCompileData* result, + JS::AutoAssertNoGC& nogc) { + LifoAllocScope allocScope(&alloc); + Zone zone(allocScope.alloc()); + + return RegExpParser::VerifyRegExpSyntax(&zone, stackLimit, input, inputLength, + flags, result, nogc); +} + +bool CheckPatternSyntax(js::LifoAlloc& alloc, JS::NativeStackLimit stackLimit, + TokenStreamAnyChars& ts, + const mozilla::Range<const char16_t> chars, + JS::RegExpFlags flags, mozilla::Maybe<uint32_t> line, + mozilla::Maybe<uint32_t> column) { + RegExpCompileData result; + JS::AutoAssertNoGC nogc; + if (!CheckPatternSyntaxImpl(alloc, stackLimit, chars.begin().get(), + chars.length(), flags, &result, nogc)) { + ReportSyntaxError(ts, line, column, result, chars.begin().get(), + chars.length()); + return false; + } + return true; +} + +bool CheckPatternSyntax(JSContext* cx, JS::NativeStackLimit stackLimit, + TokenStreamAnyChars& ts, Handle<JSAtom*> pattern, + JS::RegExpFlags flags) { + RegExpCompileData result; + JS::AutoAssertNoGC nogc(cx); + if (pattern->hasLatin1Chars()) { + if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit, + pattern->latin1Chars(nogc), pattern->length(), + flags, &result, nogc)) { + ReportSyntaxError(ts, result, pattern); + return false; + } + return true; + } + if (!CheckPatternSyntaxImpl(cx->tempLifoAlloc(), stackLimit, + pattern->twoByteChars(nogc), pattern->length(), + flags, &result, nogc)) { + ReportSyntaxError(ts, result, pattern); + return false; + } + return true; +} + +// A regexp is a good candidate for Boyer-Moore if it has at least 3 +// times as many characters as it has unique characters. Note that +// table lookups in irregexp are done modulo tableSize (128). +template <typename CharT> +static bool HasFewDifferentCharacters(const CharT* chars, size_t length) { + const uint32_t tableSize = + v8::internal::NativeRegExpMacroAssembler::kTableSize; + bool character_found[tableSize] = {}; + uint32_t different = 0; + for (uint32_t i = 0; i < length; i++) { + uint32_t ch = chars[i] % tableSize; + if (!character_found[ch]) { + character_found[ch] = true; + different++; + // We declare a regexp low-alphabet if it has at least 3 times as many + // characters as it has different characters. + if (different * 3 > length) { + return false; + } + } + } + return true; +} + +// Identifies the sort of pattern where Boyer-Moore is faster than string search +static bool UseBoyerMoore(Handle<JSAtom*> pattern, JS::AutoAssertNoGC& nogc) { + size_t length = + std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length()); + if (length <= kPatternTooShortForBoyerMoore) { + return false; + } + + if (pattern->hasLatin1Chars()) { + return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length); + } + MOZ_ASSERT(pattern->hasTwoByteChars()); + return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length); +} + +// Sample character frequency information for use in Boyer-Moore. +static void SampleCharacters(Handle<JSLinearString*> sample_subject, + RegExpCompiler& compiler) { + static const int kSampleSize = 128; + int chars_sampled = 0; + + int length = sample_subject->length(); + + int half_way = (length - kSampleSize) / 2; + for (int i = std::max(0, half_way); i < length && chars_sampled < kSampleSize; + i++, chars_sampled++) { + compiler.frequency_collator()->CountCharacter( + sample_subject->latin1OrTwoByteChar(i)); + } +} + +// Recursively walking the AST for a deeply nested regexp (like +// `/(a(a(a(a(a(a(a(...(a)...))))))))/`) may overflow the stack while +// compiling. To avoid this, we use V8's implementation of the Visitor +// pattern to walk the AST first with an overly large stack frame. +class RegExpDepthCheck final : public v8::internal::RegExpVisitor { + public: + explicit RegExpDepthCheck(JSContext* cx) : cx_(cx) {} + + bool check(v8::internal::RegExpTree* root) { + return !!root->Accept(this, nullptr); + } + + // Leaf nodes with no children +#define LEAF_DEPTH(Kind) \ + void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \ + uint8_t padding[FRAME_PADDING]; \ + dummy_ = padding; /* Prevent padding from being optimized away.*/ \ + AutoCheckRecursionLimit recursion(cx_); \ + return (void*)recursion.checkDontReport(cx_); \ + } + + LEAF_DEPTH(Assertion) + LEAF_DEPTH(Atom) + LEAF_DEPTH(BackReference) + LEAF_DEPTH(ClassSetOperand) + LEAF_DEPTH(ClassRanges) + LEAF_DEPTH(Empty) + LEAF_DEPTH(Text) +#undef LEAF_DEPTH + + // Wrapper nodes with one child +#define WRAPPER_DEPTH(Kind) \ + void* Visit##Kind(v8::internal::RegExp##Kind* node, void*) override { \ + uint8_t padding[FRAME_PADDING]; \ + dummy_ = padding; /* Prevent padding from being optimized away.*/ \ + AutoCheckRecursionLimit recursion(cx_); \ + if (!recursion.checkDontReport(cx_)) { \ + return nullptr; \ + } \ + return node->body()->Accept(this, nullptr); \ + } + + WRAPPER_DEPTH(Capture) + WRAPPER_DEPTH(Group) + WRAPPER_DEPTH(Lookaround) + WRAPPER_DEPTH(Quantifier) +#undef WRAPPER_DEPTH + + void* VisitAlternative(v8::internal::RegExpAlternative* node, + void*) override { + uint8_t padding[FRAME_PADDING]; + dummy_ = padding; /* Prevent padding from being optimized away.*/ + AutoCheckRecursionLimit recursion(cx_); + if (!recursion.checkDontReport(cx_)) { + return nullptr; + } + for (auto* child : *node->nodes()) { + if (!child->Accept(this, nullptr)) { + return nullptr; + } + } + return (void*)true; + } + void* VisitDisjunction(v8::internal::RegExpDisjunction* node, + void*) override { + uint8_t padding[FRAME_PADDING]; + dummy_ = padding; /* Prevent padding from being optimized away.*/ + AutoCheckRecursionLimit recursion(cx_); + if (!recursion.checkDontReport(cx_)) { + return nullptr; + } + for (auto* child : *node->alternatives()) { + if (!child->Accept(this, nullptr)) { + return nullptr; + } + } + return (void*)true; + } + void* VisitClassSetExpression(v8::internal::RegExpClassSetExpression* node, + void*) override { + uint8_t padding[FRAME_PADDING]; + dummy_ = padding; /* Prevent padding from being optimized away.*/ + AutoCheckRecursionLimit recursion(cx_); + if (!recursion.checkDontReport(cx_)) { + return nullptr; + } + for (auto* child : *node->operands()) { + if (!child->Accept(this, nullptr)) { + return nullptr; + } + } + return (void*)true; + } + + private: + JSContext* cx_; + void* dummy_ = nullptr; + + // This size is picked to be comfortably larger than any + // RegExp*::ToNode stack frame. + static const size_t FRAME_PADDING = 256; +}; + +enum class AssembleResult { + Success, + TooLarge, + OutOfMemory, +}; + +[[nodiscard]] static AssembleResult Assemble( + JSContext* cx, RegExpCompiler* compiler, RegExpCompileData* data, + MutableHandleRegExpShared re, Handle<JSAtom*> pattern, Zone* zone, + bool useNativeCode, bool isLatin1) { + // Because we create a StackMacroAssembler, this function is not allowed + // to GC. If needed, we allocate and throw errors in the caller. + jit::TempAllocator temp(&cx->tempLifoAlloc()); + Maybe<jit::JitContext> jctx; + Maybe<js::jit::StackMacroAssembler> stack_masm; + UniquePtr<RegExpMacroAssembler> masm; + if (useNativeCode) { + NativeRegExpMacroAssembler::Mode mode = + isLatin1 ? NativeRegExpMacroAssembler::LATIN1 + : NativeRegExpMacroAssembler::UC16; + // If we are compiling native code, we need a macroassembler, + // which needs a jit context. + jctx.emplace(cx); + stack_masm.emplace(cx, temp); +#ifdef DEBUG + // It would be much preferable to use `class AutoCreatedBy` here, but we + // may be operating without an assembler at all if `useNativeCode` is + // `false`, so there's no place to put such a call. + stack_masm.ref().pushCreator("Assemble() in RegExpAPI.cpp"); +#endif + uint32_t num_capture_registers = re->pairCount() * 2; + masm = MakeUnique<SMRegExpMacroAssembler>(cx, stack_masm.ref(), zone, mode, + num_capture_registers); + } else { + masm = MakeUnique<RegExpBytecodeGenerator>(cx->isolate, zone); + } + if (!masm) { + ReportOutOfMemory(cx); + return AssembleResult::OutOfMemory; + } + + bool isLargePattern = + pattern->length() > v8::internal::RegExp::kRegExpTooLargeToOptimize; + masm->set_slow_safe(isLargePattern); + if (compiler->optimize()) { + compiler->set_optimize(!isLargePattern); + } + + // When matching a regexp with known maximum length that is anchored + // at the end, we may be able to skip the beginning of long input + // strings. This decision is made here because it depends on + // information in the AST that isn't replicated in the Node + // structure used inside the compiler. + bool is_start_anchored = data->tree->IsAnchoredAtStart(); + bool is_end_anchored = data->tree->IsAnchoredAtEnd(); + int max_length = data->tree->max_match(); + static const int kMaxBacksearchLimit = 1024; + if (is_end_anchored && !is_start_anchored && !re->sticky() && + max_length < kMaxBacksearchLimit) { + masm->SetCurrentPositionFromEnd(max_length); + } + + if (re->global()) { + RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; + if (data->tree->min_match() > 0) { + mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; + } else if (re->unicode()) { + mode = RegExpMacroAssembler::GLOBAL_UNICODE; + } + masm->set_global_mode(mode); + } + + // The masm tracer works as a thin wrapper around another macroassembler. + RegExpMacroAssembler* masm_ptr = masm.get(); +#ifdef DEBUG + UniquePtr<RegExpMacroAssembler> tracer_masm; + if (jit::JitOptions.trace_regexp_assembler) { + tracer_masm = MakeUnique<RegExpMacroAssemblerTracer>(cx->isolate, masm_ptr); + masm_ptr = tracer_masm.get(); + } +#endif + + // Compile the regexp. + V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate); + RegExpCompiler::CompilationResult result = compiler->Assemble( + cx->isolate, masm_ptr, data->node, data->capture_count, wrappedPattern); + + if (useNativeCode) { +#ifdef DEBUG + // See comment referencing `pushCreator` above. + stack_masm.ref().popCreator(); +#endif + } + + if (!result.Succeeded()) { + MOZ_ASSERT(result.error == RegExpError::kTooLarge); + return AssembleResult::TooLarge; + } + if (result.code->value().isUndefined()) { + // SMRegExpMacroAssembler::GetCode returns undefined on OOM. + MOZ_ASSERT(useNativeCode); + return AssembleResult::OutOfMemory; + } + + re->updateMaxRegisters(result.num_registers); + if (useNativeCode) { + // Transfer ownership of the tables from the macroassembler to the + // RegExpShared. + SMRegExpMacroAssembler::TableVector& tables = + static_cast<SMRegExpMacroAssembler*>(masm.get())->tables(); + for (uint32_t i = 0; i < tables.length(); i++) { + if (!re->addTable(std::move(tables[i]))) { + ReportOutOfMemory(cx); + return AssembleResult::OutOfMemory; + } + } + re->setJitCode(v8::internal::Code::cast(*result.code).inner(), isLatin1); + } else { + // Transfer ownership of the bytecode from the HandleScope to the + // RegExpShared. + ByteArray bytecode = + v8::internal::ByteArray::cast(*result.code).takeOwnership(cx->isolate); + uint32_t length = bytecode->length; + re->setByteCode(bytecode.release(), isLatin1); + js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode); + } + + return AssembleResult::Success; +} + +struct RegExpCaptureIndexLess { + bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { + return lhs->index() < rhs->index(); + } +}; + +bool InitializeNamedCaptures(JSContext* cx, HandleRegExpShared re, + ZoneVector<RegExpCapture*>* namedCaptures) { + // The irregexp parser returns named capture information in the form + // of a ZoneVector of RegExpCaptures nodes, each of which stores the + // capture name and the corresponding capture index. We create a + // template object with a property for each capture name, and store + // the capture indices as a heap-allocated array. + uint32_t numNamedCaptures = namedCaptures->size(); + + // Named captures are sorted by name (because the set is used to ensure + // name uniqueness). But the capture name map must be sorted by index. + std::sort(namedCaptures->begin(), namedCaptures->end(), + RegExpCaptureIndexLess{}); + + // Create a plain template object. + Rooted<js::PlainObject*> templateObject( + cx, js::NewPlainObjectWithProto(cx, nullptr, TenuredObject)); + if (!templateObject) { + return false; + } + + // Allocate the capture index array. + uint32_t arraySize = numNamedCaptures * sizeof(uint32_t); + UniquePtr<uint32_t[], JS::FreePolicy> captureIndices( + static_cast<uint32_t*>(js_malloc(arraySize))); + if (!captureIndices) { + js::ReportOutOfMemory(cx); + return false; + } + + // Initialize the properties of the template and populate the + // capture index array. + RootedId id(cx); + RootedValue dummyString(cx, StringValue(cx->runtime()->emptyString)); + for (uint32_t i = 0; i < numNamedCaptures; i++) { + RegExpCapture* capture = (*namedCaptures)[i]; + JSAtom* name = + js::AtomizeChars(cx, capture->name()->data(), capture->name()->size()); + if (!name) { + return false; + } + id = NameToId(name->asPropertyName()); + if (!NativeDefineDataProperty(cx, templateObject, id, dummyString, + JSPROP_ENUMERATE)) { + return false; + } + captureIndices[i] = capture->index(); + } + + RegExpShared::InitializeNamedCaptures( + cx, re, numNamedCaptures, templateObject, captureIndices.release()); + return true; +} + +bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, + RegExpShared::CodeKind codeKind) { + Rooted<JSAtom*> pattern(cx, re->getSource()); + JS::RegExpFlags flags = re->getFlags(); + LifoAllocScope allocScope(&cx->tempLifoAlloc()); + HandleScope handleScope(cx->isolate); + Zone zone(allocScope.alloc()); + + RegExpCompileData data; + { + V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate); + if (!RegExpParser::ParseRegExpFromHeapString( + cx->isolate, &zone, wrappedPattern, flags, &data)) { + AutoReportFrontendContext fc(cx); + JS::CompileOptions options(cx); + DummyTokenStream dummyTokenStream(&fc, options); + ReportSyntaxError(dummyTokenStream, data, pattern); + return false; + } + } + + // Avoid stack overflow while recursively walking the AST. + RegExpDepthCheck depthCheck(cx); + if (!depthCheck.check(data.tree)) { + JS_ReportErrorASCII(cx, "regexp too big"); + cx->reportResourceExhaustion(); + return false; + } + + if (re->kind() == RegExpShared::Kind::Unparsed) { + // This is the first time we have compiled this regexp. + // First, check to see if we should use simple string search + // with an atom. + if (!flags.ignoreCase() && !flags.sticky()) { + Rooted<JSAtom*> searchAtom(cx); + if (data.simple) { + // The parse-tree is a single atom that is equal to the pattern. + searchAtom = re->getSource(); + } else if (data.tree->IsAtom() && data.capture_count == 0) { + // The parse-tree is a single atom that is not equal to the pattern. + v8::internal::RegExpAtom* atom = data.tree->AsAtom(); + const char16_t* twoByteChars = atom->data().begin(); + searchAtom = AtomizeChars(cx, twoByteChars, atom->length()); + if (!searchAtom) { + return false; + } + } + JS::AutoAssertNoGC nogc(cx); + if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) { + re->useAtomMatch(searchAtom); + return true; + } + } + if (data.named_captures) { + if (!InitializeNamedCaptures(cx, re, data.named_captures)) { + return false; + } + } + // All fallible initialization has succeeded, so we can change state. + // Add one to capture_count to account for the whole-match capture. + uint32_t pairCount = data.capture_count + 1; + re->useRegExpMatch(pairCount); + } + + MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp); + + RegExpCompiler compiler(cx->isolate, &zone, data.capture_count, flags, + input->hasLatin1Chars()); + + bool isLatin1 = input->hasLatin1Chars(); + + SampleCharacters(input, compiler); + data.node = compiler.PreprocessRegExp(&data, flags, isLatin1); + data.error = AnalyzeRegExp(cx->isolate, isLatin1, flags, data.node); + if (data.error != RegExpError::kNone) { + MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow); + ReportOverRecursed(cx); + return false; + } + + bool useNativeCode = codeKind == RegExpShared::CodeKind::Jitcode; + MOZ_ASSERT_IF(useNativeCode, IsNativeRegExpEnabled()); + + switch (Assemble(cx, &compiler, &data, re, pattern, &zone, useNativeCode, + isLatin1)) { + case AssembleResult::TooLarge: + JS_ReportErrorASCII(cx, "regexp too big"); + cx->reportResourceExhaustion(); + return false; + case AssembleResult::OutOfMemory: + MOZ_ASSERT(cx->isThrowingOutOfMemory()); + return false; + case AssembleResult::Success: + break; + } + return true; +} + +template <typename CharT> +RegExpRunStatus ExecuteRaw(jit::JitCode* code, const CharT* chars, + size_t length, size_t startIndex, + VectorMatchPairs* matches) { + InputOutputData data(chars, chars + length, startIndex, matches); + + static_assert(RegExpRunStatus_Error == + v8::internal::RegExp::kInternalRegExpException); + static_assert(RegExpRunStatus_Success == + v8::internal::RegExp::kInternalRegExpSuccess); + static_assert(RegExpRunStatus_Success_NotFound == + v8::internal::RegExp::kInternalRegExpFailure); + + typedef int (*RegExpCodeSignature)(InputOutputData*); + auto function = reinterpret_cast<RegExpCodeSignature>(code->raw()); + { + JS::AutoSuppressGCAnalysis nogc; + return (RegExpRunStatus)CALL_GENERATED_1(function, &data); + } +} + +RegExpRunStatus Interpret(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, size_t startIndex, + VectorMatchPairs* matches) { + MOZ_ASSERT(re->getByteCode(input->hasLatin1Chars())); + + HandleScope handleScope(cx->isolate); + V8HandleRegExp wrappedRegExp(v8::internal::JSRegExp(re), cx->isolate); + V8HandleString wrappedInput(v8::internal::String(input), cx->isolate); + + static_assert(RegExpRunStatus_Error == + v8::internal::RegExp::kInternalRegExpException); + static_assert(RegExpRunStatus_Success == + v8::internal::RegExp::kInternalRegExpSuccess); + static_assert(RegExpRunStatus_Success_NotFound == + v8::internal::RegExp::kInternalRegExpFailure); + + RegExpRunStatus status = + (RegExpRunStatus)IrregexpInterpreter::MatchForCallFromRuntime( + cx->isolate, wrappedRegExp, wrappedInput, matches->pairsRaw(), + uint32_t(matches->pairCount() * 2), uint32_t(startIndex)); + + MOZ_ASSERT(status == RegExpRunStatus_Error || + status == RegExpRunStatus_Success || + status == RegExpRunStatus_Success_NotFound); + + return status; +} + +RegExpRunStatus Execute(JSContext* cx, MutableHandleRegExpShared re, + Handle<JSLinearString*> input, size_t startIndex, + VectorMatchPairs* matches) { + bool latin1 = input->hasLatin1Chars(); + jit::JitCode* jitCode = re->getJitCode(latin1); + bool isCompiled = !!jitCode; + + // Reset the Irregexp backtrack stack if it grows during execution. + irregexp::RegExpStackScope stackScope(cx->isolate); + + if (isCompiled) { + JS::AutoCheckCannotGC nogc; + if (latin1) { + return ExecuteRaw(jitCode, input->latin1Chars(nogc), input->length(), + startIndex, matches); + } + return ExecuteRaw(jitCode, input->twoByteChars(nogc), input->length(), + startIndex, matches); + } + + return Interpret(cx, re, input, startIndex, matches); +} + +RegExpRunStatus ExecuteForFuzzing(JSContext* cx, Handle<JSAtom*> pattern, + Handle<JSLinearString*> input, + JS::RegExpFlags flags, size_t startIndex, + VectorMatchPairs* matches, + RegExpShared::CodeKind codeKind) { + RootedRegExpShared re(cx, cx->zone()->regExps().get(cx, pattern, flags)); + if (!RegExpShared::compileIfNecessary(cx, &re, input, codeKind)) { + return RegExpRunStatus_Error; + } + return RegExpShared::execute(cx, &re, input, startIndex, matches); +} + +bool GrowBacktrackStack(RegExpStack* regexp_stack) { + return SMRegExpMacroAssembler::GrowBacktrackStack(regexp_stack); +} + +uint32_t CaseInsensitiveCompareNonUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength) { + return SMRegExpMacroAssembler::CaseInsensitiveCompareNonUnicode( + substring1, substring2, byteLength); +} + +uint32_t CaseInsensitiveCompareUnicode(const char16_t* substring1, + const char16_t* substring2, + size_t byteLength) { + return SMRegExpMacroAssembler::CaseInsensitiveCompareUnicode( + substring1, substring2, byteLength); +} + +bool IsCharacterInRangeArray(uint32_t c, ByteArrayData* ranges) { + return SMRegExpMacroAssembler::IsCharacterInRangeArray(c, ranges); +} + +#ifdef DEBUG +bool IsolateShouldSimulateInterrupt(Isolate* isolate) { + return isolate->shouldSimulateInterrupt_ != 0; +} + +void IsolateSetShouldSimulateInterrupt(Isolate* isolate) { + isolate->shouldSimulateInterrupt_ = 1; +} +void IsolateClearShouldSimulateInterrupt(Isolate* isolate) { + isolate->shouldSimulateInterrupt_ = 0; +} +#endif + +} // namespace irregexp +} // namespace js |