diff options
Diffstat (limited to 'js/src/irregexp/RegExpShim.h')
-rw-r--r-- | js/src/irregexp/RegExpShim.h | 1283 |
1 files changed, 1283 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpShim.h b/js/src/irregexp/RegExpShim.h new file mode 100644 index 0000000000..e503ffb096 --- /dev/null +++ b/js/src/irregexp/RegExpShim.h @@ -0,0 +1,1283 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef RegexpShim_h +#define RegexpShim_h + +#include "mozilla/Assertions.h" +#include "mozilla/Attributes.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/Maybe.h" +#include "mozilla/SegmentedVector.h" +#include "mozilla/Sprintf.h" +#include "mozilla/Types.h" + +#include <algorithm> +#include <cctype> +#include <iterator> + +#include "irregexp/RegExpTypes.h" +#include "irregexp/util/FlagsShim.h" +#include "irregexp/util/VectorShim.h" +#include "irregexp/util/ZoneShim.h" +#include "jit/JitCode.h" +#include "jit/Label.h" +#include "jit/shared/Assembler-shared.h" +#include "js/friend/StackLimits.h" // js::AutoCheckRecursionLimit +#include "js/RegExpFlags.h" +#include "js/Value.h" +#include "threading/ExclusiveData.h" +#include "util/DifferentialTesting.h" +#include "vm/JSContext.h" +#include "vm/MutexIDs.h" +#include "vm/NativeObject.h" +#include "vm/RegExpShared.h" + +// Forward declaration of classes +namespace v8 { +namespace internal { + +class Heap; +class Isolate; +class RegExpMatchInfo; +class RegExpStack; + +template <typename T> +class Handle; + +} // namespace internal +} // namespace v8 + +#define V8_WARN_UNUSED_RESULT [[nodiscard]] +#define V8_EXPORT_PRIVATE +#define V8_FALLTHROUGH [[fallthrough]] +#define V8_NODISCARD [[nodiscard]] +#define V8_NOEXCEPT noexcept + 
// ---------------------------------------------------------------------------
// V8 assertion / utility macros mapped onto their Mozilla equivalents.
// Each directive sits on its own line: a preprocessor directive extends to
// the end of the physical line, so they cannot share a line with other code.
// ---------------------------------------------------------------------------

// Unconditional crashes.
#define FATAL(x) MOZ_CRASH(x)
#define UNREACHABLE() MOZ_CRASH("unreachable code")
#define UNIMPLEMENTED() MOZ_CRASH("unimplemented code")

// Compile-time assertion; the stringified expression is used as the message.
#define STATIC_ASSERT(exp) static_assert(exp, #exp)

// DCHECK* forward to MOZ_ASSERT / MOZ_ASSERT_IF.
#define DCHECK MOZ_ASSERT
#define DCHECK_EQ(lhs, rhs) MOZ_ASSERT((lhs) == (rhs))
#define DCHECK_NE(lhs, rhs) MOZ_ASSERT((lhs) != (rhs))
#define DCHECK_GT(lhs, rhs) MOZ_ASSERT((lhs) > (rhs))
#define DCHECK_GE(lhs, rhs) MOZ_ASSERT((lhs) >= (rhs))
#define DCHECK_LT(lhs, rhs) MOZ_ASSERT((lhs) < (rhs))
#define DCHECK_LE(lhs, rhs) MOZ_ASSERT((lhs) <= (rhs))
#define DCHECK_NULL(val) MOZ_ASSERT((val) == nullptr)
#define DCHECK_NOT_NULL(val) MOZ_ASSERT((val) != nullptr)
#define DCHECK_IMPLIES(lhs, rhs) MOZ_ASSERT_IF(lhs, rhs)

// CHECK* forward to MOZ_RELEASE_ASSERT.
#define CHECK MOZ_RELEASE_ASSERT
#define CHECK_EQ(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) == (rhs))
#define CHECK_LE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) <= (rhs))
#define CHECK_GE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) >= (rhs))
#define CONSTEXPR_DCHECK MOZ_ASSERT

#define MemCopy memcpy

// Origin:
// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L310-L319
// ptrdiff_t is 't' according to the standard, but MSVC uses 'I'.
#ifdef _MSC_VER
#  define V8PRIxPTRDIFF "Ix"
#  define V8PRIdPTRDIFF "Id"
#  define V8PRIuPTRDIFF "Iu"
#else
#  define V8PRIxPTRDIFF "tx"
#  define V8PRIdPTRDIFF "td"
#  define V8PRIuPTRDIFF "tu"
#endif

#define arraysize std::size

// Explicitly declare the assignment operator as deleted.
#define DISALLOW_ASSIGN(TypeName) TypeName& operator=(const TypeName&) = delete

// Explicitly declare the copy constructor and assignment operator as deleted.
// This also deletes the implicit move constructor and implicit move assignment
// operator, but still allows to manually define them.
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + DISALLOW_ASSIGN(TypeName) + +// Explicitly declare all implicit constructors as deleted, namely the +// default constructor, copy constructor and operator= functions. +// This is especially useful for classes containing only static methods. +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName() = delete; \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +namespace v8 { + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L364-L367 +template <typename T, typename U> +constexpr inline bool IsAligned(T value, U alignment) { + return (value & (alignment - 1)) == 0; +} + +using byte = uint8_t; +using Address = uintptr_t; +static const Address kNullAddress = 0; + +inline uintptr_t GetCurrentStackPosition() { + return reinterpret_cast<uintptr_t>(__builtin_frame_address(0)); +} + +namespace base { + +// Latin1/UTF-16 constants +// Code-point values in Unicode 4.0 are 21 bits wide. +// Code units in UTF-16 are 16 bits wide. +using uc16 = char16_t; +using uc32 = uint32_t; + +constexpr int kUC16Size = sizeof(base::uc16); + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/macros.h#L247-L258 +// The USE(x, ...) template is used to silence C++ compiler warnings +// issued for (yet) unused variables (typically parameters). +// The arguments are guaranteed to be evaluated from left to right. +struct Use { + template <typename T> + Use(T&&) {} // NOLINT(runtime/explicit) +}; +#define USE(...) 
\ + do { \ + ::v8::base::Use unused_tmp_array_for_use_macro[]{__VA_ARGS__}; \ + (void)unused_tmp_array_for_use_macro; \ + } while (false) + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/base/safe_conversions.h#L35-L39 +// saturated_cast<> is analogous to static_cast<> for numeric types, except +// that the specified numeric conversion will saturate rather than overflow or +// underflow. +template <typename Dst, typename Src> +inline Dst saturated_cast(Src value); + +// This is the only specialization that is needed for regexp code. +// Instead of pulling in dozens of lines of template goo +// to derive it, I used the implementation from uint8_clamped in +// ArrayBufferObject.h. +template <> +inline uint8_t saturated_cast<uint8_t, int>(int x) { + return (x >= 0) ? ((x < 255) ? uint8_t(x) : 255) : 0; +} + +// Origin: +// https://github.com/v8/v8/blob/fc088cdaccadede84886eee881e67af9db53669a/src/base/bounds.h#L14-L28 +// Checks if value is in range [lower_limit, higher_limit] using a single +// branch. +template <typename T, typename U> +inline constexpr bool IsInRange(T value, U lower_limit, U higher_limit) { + using unsigned_T = typename std::make_unsigned<T>::type; + // Use static_cast to support enum classes. 
+ return static_cast<unsigned_T>(static_cast<unsigned_T>(value) - + static_cast<unsigned_T>(lower_limit)) <= + static_cast<unsigned_T>(static_cast<unsigned_T>(higher_limit) - + static_cast<unsigned_T>(lower_limit)); +} + +#define LAZY_INSTANCE_INITIALIZER \ + {} + +template <typename T> +class LazyInstanceImpl { + public: + LazyInstanceImpl() : value_(js::mutexid::IrregexpLazyStatic) {} + + const T* Pointer() { + auto val = value_.lock(); + if (val->isNothing()) { + val->emplace(); + } + return val->ptr(); + } + + private: + js::ExclusiveData<mozilla::Maybe<T>> value_; +}; + +template <typename T> +class LazyInstance { + public: + using type = LazyInstanceImpl<T>; +}; + +// Origin: +// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/utils.h#L40-L48 +// Returns the value (0 .. 15) of a hexadecimal character c. +// If c is not a legal hexadecimal character, returns a value < 0. +// Used in regexp-parser.cc +inline int HexValue(base::uc32 c) { + c -= '0'; + if (static_cast<unsigned>(c) <= 9) return c; + c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. + if (static_cast<unsigned>(c) <= 5) return c + 10; + return -1; +} + +template <typename... Args> +[[nodiscard]] uint32_t hash_combine(uint32_t aHash, Args... 
aArgs) { + return mozilla::AddToHash(aHash, aArgs...); +} + +template <typename T> +class Optional { + mozilla::Maybe<T> inner_; + + public: + Optional() = default; + Optional(T t) { inner_.emplace(t); } + + bool has_value() const { return inner_.isSome(); } + const T& value() const { return inner_.ref(); } +}; + +namespace bits { + +inline uint64_t CountTrailingZeros(uint64_t value) { + return mozilla::CountTrailingZeroes64(value); +} + +inline size_t RoundUpToPowerOfTwo32(size_t value) { + return mozilla::RoundUpPow2(value); +} + +template <typename T> +constexpr bool IsPowerOfTwo(T value) { + return value > 0 && (value & (value - 1)) == 0; +} + +} // namespace bits +} // namespace base + +namespace unibrow { + +using uchar = unsigned int; + +// Origin: +// https://github.com/v8/v8/blob/1f1e4cdb04c75eab77adbecd5f5514ddc3eb56cf/src/strings/unicode.h#L133-L150 +class Latin1 { + public: + static const base::uc16 kMaxChar = 0xff; + + // Convert the character to Latin-1 case equivalent if possible. + static inline base::uc16 TryConvertToLatin1(base::uc16 c) { + // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN". + // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN". + if (c == 0x039C || c == 0x03BC) { + return 0xB5; + } + // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER + // Y WITH DIAERESIS". 
+ if (c == 0x0178) { + return 0xFF; + } + return c; + } +}; + +// Origin: +// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L99-L131 +class Utf16 { + public: + static inline bool IsLeadSurrogate(int code) { + return js::unicode::IsLeadSurrogate(code); + } + static inline bool IsTrailSurrogate(int code) { + return js::unicode::IsTrailSurrogate(code); + } + static inline base::uc16 LeadSurrogate(uint32_t char_code) { + return js::unicode::LeadSurrogate(char_code); + } + static inline base::uc16 TrailSurrogate(uint32_t char_code) { + return js::unicode::TrailSurrogate(char_code); + } + static inline uint32_t CombineSurrogatePair(char16_t lead, char16_t trail) { + return js::unicode::UTF16Decode(lead, trail); + } + static const uchar kMaxNonSurrogateCharCode = 0xffff; +}; + +#ifndef V8_INTL_SUPPORT + +// A cache used in case conversion. It caches the value for characters +// that either have no mapping or map to a single character independent +// of context. Characters that map to more than one character or that +// map differently depending on context are always looked up. 
+// Origin: +// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L64-L88 +template <class T, int size = 256> +class Mapping { + public: + inline Mapping() = default; + inline int get(uchar c, uchar n, uchar* result) { + CacheEntry entry = entries_[c & kMask]; + if (entry.code_point_ == c) { + if (entry.offset_ == 0) { + return 0; + } else { + result[0] = c + entry.offset_; + return 1; + } + } else { + return CalculateValue(c, n, result); + } + } + + private: + int CalculateValue(uchar c, uchar n, uchar* result) { + bool allow_caching = true; + int length = T::Convert(c, n, result, &allow_caching); + if (allow_caching) { + if (length == 1) { + entries_[c & kMask] = CacheEntry(c, result[0] - c); + return 1; + } else { + entries_[c & kMask] = CacheEntry(c, 0); + return 0; + } + } else { + return length; + } + } + + struct CacheEntry { + inline CacheEntry() : code_point_(kNoChar), offset_(0) {} + inline CacheEntry(uchar code_point, signed offset) + : code_point_(code_point), offset_(offset) {} + uchar code_point_; + signed offset_; + static const int kNoChar = (1 << 21) - 1; + }; + static const int kSize = size; + static const int kMask = kSize - 1; + CacheEntry entries_[kSize]; +}; + +// Origin: +// https://github.com/v8/v8/blob/b4bfbce6f91fc2cc72178af42bb3172c5f5eaebb/src/strings/unicode.h#L241-L252 +struct Ecma262Canonicalize { + static const int kMaxWidth = 1; + static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); +}; +struct Ecma262UnCanonicalize { + static const int kMaxWidth = 4; + static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); +}; +struct CanonicalizationRange { + static const int kMaxWidth = 1; + static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); +}; + +#endif // !V8_INTL_SUPPORT + +struct Letter { + static bool Is(uchar c); +}; + +} // namespace unibrow + +namespace internal { + +#define PRINTF_FORMAT(x, y) MOZ_FORMAT_PRINTF(x, y) 
+void PRINTF_FORMAT(1, 2) PrintF(const char* format, ...); +void PRINTF_FORMAT(2, 3) PrintF(FILE* out, const char* format, ...); + +// Superclass for classes only using static method functions. +// The subclass of AllStatic cannot be instantiated at all. +class AllStatic { +#ifdef DEBUG + public: + AllStatic() = delete; +#endif +}; + +// Superclass for classes managed with new and delete. +// In irregexp, this is only AlternativeGeneration (in regexp-compiler.cc) +// Compare: +// https://github.com/v8/v8/blob/7b3332844212d78ee87a9426f3a6f7f781a8fbfa/src/utils/allocation.cc#L88-L96 +class Malloced { + public: + static void* operator new(size_t size) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + void* result = js_malloc(size); + if (!result) { + oomUnsafe.crash("Irregexp Malloced shim"); + } + return result; + } + static void operator delete(void* p) { js_free(p); } +}; + +constexpr int32_t KB = 1024; +constexpr int32_t MB = 1024 * 1024; + +#define kMaxInt JSVAL_INT_MAX +#define kMinInt JSVAL_INT_MIN +constexpr int kSystemPointerSize = sizeof(void*); + +// The largest integer n such that n and n + 1 are both exactly +// representable as a Number value. 
ES6 section 20.1.2.6 +constexpr double kMaxSafeInteger = 9007199254740991.0; // 2^53-1 + +constexpr int kBitsPerByte = 8; +constexpr int kBitsPerByteLog2 = 3; +constexpr int kUInt16Size = sizeof(uint16_t); +constexpr int kUInt32Size = sizeof(uint32_t); +constexpr int kInt64Size = sizeof(int64_t); + +constexpr int kMaxUInt16 = (1 << 16) - 1; + +inline constexpr bool IsDecimalDigit(base::uc32 c) { + return c >= '0' && c <= '9'; +} + +inline constexpr int AsciiAlphaToLower(base::uc32 c) { return c | 0x20; } + +inline bool is_uint24(int64_t val) { return (val >> 24) == 0; } +inline bool is_int24(int64_t val) { + int64_t limit = int64_t(1) << 23; + return (-limit <= val) && (val < limit); +} + +inline bool IsIdentifierStart(base::uc32 c) { + return js::unicode::IsIdentifierStart(char32_t(c)); +} +inline bool IsIdentifierPart(base::uc32 c) { + return js::unicode::IsIdentifierPart(char32_t(c)); +} + +// Wrappers to disambiguate char16_t and uc16. +struct AsUC16 { + explicit AsUC16(char16_t v) : value(v) {} + char16_t value; +}; + +struct AsUC32 { + explicit AsUC32(int32_t v) : value(v) {} + int32_t value; +}; + +std::ostream& operator<<(std::ostream& os, const AsUC16& c); +std::ostream& operator<<(std::ostream& os, const AsUC32& c); + +// This class is used for the output of trace-regexp-parser. V8 has +// an elaborate implementation to ensure that the output gets to the +// right place, even on Android. We just need something that will +// print output (ideally to stderr, to match the rest of our tracing +// code). This is an empty wrapper that will convert itself to +// std::cerr when used. 
+class StdoutStream { + public: + operator std::ostream&() const; + template <typename T> + std::ostream& operator<<(T t); +}; + +// Reuse existing Maybe implementation +using mozilla::Maybe; + +template <typename T> +Maybe<T> Just(const T& value) { + return mozilla::Some(value); +} + +template <typename T> +mozilla::Nothing Nothing() { + return mozilla::Nothing(); +} + +template <typename T> +using PseudoHandle = mozilla::UniquePtr<T, JS::FreePolicy>; + +// Compare 8bit/16bit chars to 8bit/16bit chars. +// Used indirectly by regexp-interpreter.cc +// Taken from: https://github.com/v8/v8/blob/master/src/utils/utils.h +template <typename lchar, typename rchar> +inline int CompareCharsUnsigned(const lchar* lhs, const rchar* rhs, + size_t chars) { + const lchar* limit = lhs + chars; + if (sizeof(*lhs) == sizeof(char) && sizeof(*rhs) == sizeof(char)) { + // memcmp compares byte-by-byte, yielding wrong results for two-byte + // strings on little-endian systems. + return memcmp(lhs, rhs, chars); + } + while (lhs < limit) { + int r = static_cast<int>(*lhs) - static_cast<int>(*rhs); + if (r != 0) return r; + ++lhs; + ++rhs; + } + return 0; +} +template <typename lchar, typename rchar> +inline int CompareChars(const lchar* lhs, const rchar* rhs, size_t chars) { + DCHECK_LE(sizeof(lchar), 2); + DCHECK_LE(sizeof(rchar), 2); + if (sizeof(lchar) == 1) { + if (sizeof(rchar) == 1) { + return CompareCharsUnsigned(reinterpret_cast<const uint8_t*>(lhs), + reinterpret_cast<const uint8_t*>(rhs), chars); + } else { + return CompareCharsUnsigned(reinterpret_cast<const uint8_t*>(lhs), + reinterpret_cast<const char16_t*>(rhs), + chars); + } + } else { + if (sizeof(rchar) == 1) { + return CompareCharsUnsigned(reinterpret_cast<const char16_t*>(lhs), + reinterpret_cast<const uint8_t*>(rhs), chars); + } else { + return CompareCharsUnsigned(reinterpret_cast<const char16_t*>(lhs), + reinterpret_cast<const char16_t*>(rhs), + chars); + } + } +} + +// Compare 8bit/16bit chars to 8bit/16bit chars. 
+template <typename lchar, typename rchar> +inline bool CompareCharsEqualUnsigned(const lchar* lhs, const rchar* rhs, + size_t chars) { + STATIC_ASSERT(std::is_unsigned<lchar>::value); + STATIC_ASSERT(std::is_unsigned<rchar>::value); + if (sizeof(*lhs) == sizeof(*rhs)) { + // memcmp compares byte-by-byte, but for equality it doesn't matter whether + // two-byte char comparison is little- or big-endian. + return memcmp(lhs, rhs, chars * sizeof(*lhs)) == 0; + } + for (const lchar* limit = lhs + chars; lhs < limit; ++lhs, ++rhs) { + if (*lhs != *rhs) return false; + } + return true; +} + +template <typename lchar, typename rchar> +inline bool CompareCharsEqual(const lchar* lhs, const rchar* rhs, + size_t chars) { + using ulchar = typename std::make_unsigned<lchar>::type; + using urchar = typename std::make_unsigned<rchar>::type; + return CompareCharsEqualUnsigned(reinterpret_cast<const ulchar*>(lhs), + reinterpret_cast<const urchar*>(rhs), chars); +} + +// V8::Object ~= JS::Value +class Object { + public: + // The default object constructor in V8 stores a nullptr, + // which has its low bit clear and is interpreted as Smi(0). + constexpr Object() : asBits_(JS::Int32Value(0).asRawBits()) {} + + Object(const JS::Value& value) : asBits_(value.asRawBits()) {} + + // This constructor is only used in an unused implementation of + // IsCharacterInRangeArray in regexp-macro-assembler.cc. + Object(uintptr_t raw) : asBits_(raw) { MOZ_CRASH("unused"); } + + // Used in regexp-interpreter.cc to check the return value of + // isolate->stack_guard()->HandleInterrupts(). We want to handle + // interrupts in the caller, so we always return false from + // HandleInterrupts and true here. 
+ inline bool IsException(Isolate*) const { + MOZ_ASSERT(!value().toBoolean()); + return true; + } + + JS::Value value() const { return JS::Value::fromRawBits(asBits_); } + + inline static Object cast(Object object) { return object; } + + protected: + void setValue(const JS::Value& val) { asBits_ = val.asRawBits(); } + uint64_t asBits_; +} JS_HAZ_GC_POINTER; + +class Smi : public Object { + public: + static Smi FromInt(int32_t value) { + Smi smi; + smi.setValue(JS::Int32Value(value)); + return smi; + } + static inline int32_t ToInt(const Object object) { + return object.value().toInt32(); + } +}; + +// V8::HeapObject ~= GC thing +class HeapObject : public Object { + public: + inline static HeapObject cast(Object object) { + HeapObject h; + h.setValue(object.value()); + return h; + } +}; + +// A fixed-size array with Objects (aka Values) as element types. +// Implemented using the dense elements of an ArrayObject. +// Used for named captures. +class FixedArray : public HeapObject { + public: + inline void set(uint32_t index, Object value) { + inner()->setDenseElement(index, value.value()); + } + inline static FixedArray cast(Object object) { + FixedArray f; + f.setValue(object.value()); + return f; + } + js::NativeObject* inner() { + return &value().toObject().as<js::NativeObject>(); + } +}; + +/* + * Conceptually, ByteArrayData is a variable-size structure. To + * implement this in a C++-approved way, we allocate a struct + * containing the 32-bit length field, followed by additional memory + * for the data. To access the data, we get a pointer to the next byte + * after the length field and cast it to the correct type. 
+ */ +inline uint8_t* ByteArrayData::data() { + static_assert(alignof(uint8_t) <= alignof(ByteArrayData), + "The trailing data must be aligned to start immediately " + "after the header with no padding."); + ByteArrayData* immediatelyAfter = this + 1; + return reinterpret_cast<uint8_t*>(immediatelyAfter); +} + +template <typename T> +T* ByteArrayData::typedData() { + static_assert(alignof(T) <= alignof(ByteArrayData)); + MOZ_ASSERT(uintptr_t(data()) % alignof(T) == 0); + return reinterpret_cast<T*>(data()); +} + +template <typename T> +T ByteArrayData::getTyped(uint32_t index) { + MOZ_ASSERT(index < length / sizeof(T)); + return typedData<T>()[index]; +} + +template <typename T> +void ByteArrayData::setTyped(uint32_t index, T value) { + MOZ_ASSERT(index < length / sizeof(T)); + typedData<T>()[index] = value; +} + +// A fixed-size array of bytes. +class ByteArray : public HeapObject { + protected: + ByteArrayData* inner() const { + return static_cast<ByteArrayData*>(value().toPrivate()); + } + + public: + PseudoHandle<ByteArrayData> takeOwnership(Isolate* isolate); + PseudoHandle<ByteArrayData> maybeTakeOwnership(Isolate* isolate); + + byte get(uint32_t index) { return inner()->get(index); } + void set(uint32_t index, byte val) { inner()->set(index, val); } + + uint32_t length() const { return inner()->length; } + byte* GetDataStartAddress() { return inner()->data(); } + + static ByteArray cast(Object object) { + ByteArray b; + b.setValue(object.value()); + return b; + } + + bool IsByteArray() const { return true; } + + friend class SMRegExpMacroAssembler; +}; + +// This is a convenience class used in V8 for treating a ByteArray as an array +// of fixed-size integers. This version supports integral types up to 32 bits. 
+template <typename T> +class FixedIntegerArray : public ByteArray { + static_assert(alignof(T) <= alignof(ByteArrayData)); + static_assert(std::is_integral<T>::value); + + public: + static Handle<FixedIntegerArray<T>> New(Isolate* isolate, uint32_t length); + + T get(uint32_t index) { return inner()->template getTyped<T>(index); }; + void set(uint32_t index, T value) { + inner()->template setTyped<T>(index, value); + } + + static FixedIntegerArray<T> cast(Object object) { + FixedIntegerArray<T> f; + f.setValue(object.value()); + return f; + } +}; + +using FixedUInt16Array = FixedIntegerArray<uint16_t>; + +// Like Handles in SM, V8 handles are references to marked pointers. +// Unlike SM, where Rooted pointers are created individually on the +// stack, the target of a V8 handle lives in an arena on the isolate +// (~= JSContext). Whenever a Handle is created, a new "root" is +// created at the end of the arena. +// +// HandleScopes are used to manage the lifetimes of these handles. A +// HandleScope lives on the stack and stores the size of the arena at +// the time of its creation. When the function returns and the +// HandleScope is destroyed, the arena is truncated to its previous +// size, clearing all roots that were created since the creation of +// the HandleScope. +// +// In some cases, objects that are GC-allocated in V8 are not in SM. +// In particular, irregexp allocates ByteArrays during code generation +// to store lookup tables. This does not play nicely with the SM +// macroassembler's requirement that no GC allocations take place +// while it is on the stack. To work around this, this shim layer also +// provides the ability to create pseudo-handles, which are not +// managed by the GC but provide the same API to irregexp. The "root" +// of a pseudohandle is a unique pointer living in a second arena. If +// the allocated object should outlive the HandleScope, it must be +// manually moved out of the arena using maybeTakeOwnership. 
+// (If maybeTakeOwnership is called multiple times, it will return +// a null pointer on subsequent calls.) + +class MOZ_STACK_CLASS HandleScope { + public: + HandleScope(Isolate* isolate); + ~HandleScope(); + + private: + size_t level_ = 0; + size_t non_gc_level_ = 0; + Isolate* isolate_; + + friend class Isolate; +}; + +// Origin: +// https://github.com/v8/v8/blob/5792f3587116503fc047d2f68c951c72dced08a5/src/handles/handles.h#L88-L171 +template <typename T> +class MOZ_NONHEAP_CLASS Handle { + public: + Handle() : location_(nullptr) {} + Handle(T object, Isolate* isolate); + Handle(const JS::Value& value, Isolate* isolate); + + // Constructor for handling automatic up casting. + template <typename S, + typename = std::enable_if_t<std::is_convertible_v<S*, T*>>> + inline Handle(Handle<S> handle) : location_(handle.location_) {} + + inline bool is_null() const { return location_ == nullptr; } + + inline T operator*() const { return T::cast(Object(*location_)); }; + + // {ObjectRef} is returned by {Handle::operator->}. It should never be stored + // anywhere or used in any other code; no one should ever have to spell out + // {ObjectRef} in code. Its only purpose is to be dereferenced immediately by + // "operator-> chaining". Returning the address of the field is valid because + // this object's lifetime only ends at the end of the full statement. 
+ // Origin: + // https://github.com/v8/v8/blob/03aaa4b3bf4cb01eee1f223b252e6869b04ab08c/src/handles/handles.h#L91-L105 + class MOZ_TEMPORARY_CLASS ObjectRef { + public: + T* operator->() { return &object_; } + + private: + friend class Handle; + explicit ObjectRef(T object) : object_(object) {} + + T object_; + }; + inline ObjectRef operator->() const { return ObjectRef{**this}; } + + static Handle<T> fromHandleValue(JS::HandleValue handle) { + return Handle(handle.address()); + } + + private: + Handle(const JS::Value* location) : location_(location) {} + + template <typename> + friend class Handle; + template <typename> + friend class MaybeHandle; + + const JS::Value* location_; +}; + +// A Handle can be converted into a MaybeHandle. Converting a MaybeHandle +// into a Handle requires checking that it does not point to nullptr. This +// ensures nullptr checks before use. +// +// Also note that Handles do not provide default equality comparison or hashing +// operators on purpose. Such operators would be misleading, because intended +// semantics is ambiguous between Handle location and object identity. +// Origin: +// https://github.com/v8/v8/blob/5792f3587116503fc047d2f68c951c72dced08a5/src/handles/maybe-handles.h#L15-L78 +template <typename T> +class MOZ_NONHEAP_CLASS MaybeHandle final { + public: + MaybeHandle() : location_(nullptr) {} + + // Constructor for handling automatic up casting from Handle. + // Ex. Handle<JSArray> can be passed when MaybeHandle<Object> is expected. + template <typename S, + typename = std::enable_if_t<std::is_convertible_v<S*, T*>>> + MaybeHandle(Handle<S> handle) : location_(handle.location_) {} + + inline Handle<T> ToHandleChecked() const { + MOZ_RELEASE_ASSERT(location_); + return Handle<T>(location_); + } + + // Convert to a Handle with a type that can be upcasted to. 
+ template <typename S> + inline bool ToHandle(Handle<S>* out) const { + if (location_) { + *out = Handle<T>(location_); + return true; + } else { + *out = Handle<T>(); + return false; + } + } + + private: + JS::Value* location_; +}; + +// From v8/src/handles/handles-inl.h + +template <typename T> +inline Handle<T> handle(T object, Isolate* isolate) { + return Handle<T>(object, isolate); +} + +// RAII Guard classes + +using DisallowGarbageCollection = JS::AutoAssertNoGC; + +// V8 uses this inside DisallowGarbageCollection regions to turn +// allocation back on before throwing a stack overflow exception or +// handling interrupts. AutoSuppressGC is sufficient for the former +// case, but not for the latter: handling interrupts can execute +// arbitrary script code, and V8 jumps through some scary hoops to +// "manually relocate unhandlified references" afterwards. To keep +// things sane, we don't try to handle interrupts while regex code is +// still on the stack. Instead, we return EXCEPTION and handle +// interrupts in the caller. (See RegExpShared::execute.) + +class AllowGarbageCollection { + public: + AllowGarbageCollection() {} +}; + +// Origin: +// https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L83-L474 +class String : public HeapObject { + private: + JSString* str() const { return value().toString(); } + + public: + String() = default; + String(JSString* str) { setValue(JS::StringValue(str)); } + + operator JSString*() const { return str(); } + + // Max char codes. 
+ static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; + static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar; + static const int kMaxUtf16CodeUnit = 0xffff; + static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit; + static const base::uc32 kMaxCodePoint = 0x10ffff; + + MOZ_ALWAYS_INLINE int length() const { return str()->length(); } + bool IsFlat() { return str()->isLinear(); }; + + // Origin: + // https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L95-L152 + class FlatContent { + public: + FlatContent(JSLinearString* string, const DisallowGarbageCollection& no_gc) + : string_(string), no_gc_(no_gc) {} + inline bool IsOneByte() const { return string_->hasLatin1Chars(); } + inline bool IsTwoByte() const { return !string_->hasLatin1Chars(); } + + base::Vector<const uint8_t> ToOneByteVector() const { + MOZ_ASSERT(IsOneByte()); + return base::Vector<const uint8_t>(string_->latin1Chars(no_gc_), + string_->length()); + } + base::Vector<const base::uc16> ToUC16Vector() const { + MOZ_ASSERT(IsTwoByte()); + return base::Vector<const base::uc16>(string_->twoByteChars(no_gc_), + string_->length()); + } + void UnsafeDisableChecksumVerification() { + // Intentional no-op. See the comment for AllowGarbageCollection above. 
+ } + + private: + const JSLinearString* string_; + const JS::AutoAssertNoGC& no_gc_; + }; + FlatContent GetFlatContent(const DisallowGarbageCollection& no_gc) { + MOZ_ASSERT(IsFlat()); + return FlatContent(&str()->asLinear(), no_gc); + } + + static Handle<String> Flatten(Isolate* isolate, Handle<String> string); + + inline static String cast(Object object) { + String s; + MOZ_ASSERT(object.value().isString()); + s.setValue(object.value()); + return s; + } + + inline static bool IsOneByteRepresentationUnderneath(String string) { + return string.str()->hasLatin1Chars(); + } + inline bool IsOneByteRepresentation() const { + return str()->hasLatin1Chars(); + } + + std::unique_ptr<char[]> ToCString(); + + template <typename Char> + base::Vector<const Char> GetCharVector( + const DisallowGarbageCollection& no_gc); +}; + +template <> +inline base::Vector<const uint8_t> String::GetCharVector( + const DisallowGarbageCollection& no_gc) { + String::FlatContent flat = GetFlatContent(no_gc); + MOZ_ASSERT(flat.IsOneByte()); + return flat.ToOneByteVector(); +} + +template <> +inline base::Vector<const base::uc16> String::GetCharVector( + const DisallowGarbageCollection& no_gc) { + String::FlatContent flat = GetFlatContent(no_gc); + MOZ_ASSERT(flat.IsTwoByte()); + return flat.ToUC16Vector(); +} + +class JSRegExp : public HeapObject { + public: + JSRegExp() : HeapObject() {} + JSRegExp(js::RegExpShared* re) { setValue(JS::PrivateGCThingValue(re)); } + + // ****************************************************** + // Methods that are called from inside the implementation + // ****************************************************** + void TierUpTick() { inner()->tierUpTick(); } + + Object bytecode(bool is_latin1) const { + return Object(JS::PrivateValue(inner()->getByteCode(is_latin1))); + } + + // TODO: should we expose this? 
+ uint32_t backtrack_limit() const { return 0; } + + static JSRegExp cast(Object object) { + JSRegExp regexp; + js::gc::Cell* regexpShared = object.value().toGCThing(); + MOZ_ASSERT(regexpShared->is<js::RegExpShared>()); + regexp.setValue(JS::PrivateGCThingValue(regexpShared)); + return regexp; + } + + // Each capture (including the match itself) needs two registers. + static constexpr int RegistersForCaptureCount(int count) { + return (count + 1) * 2; + } + + inline uint32_t max_register_count() const { + return inner()->getMaxRegisters(); + } + + // ****************************** + // Static constants + // ****************************** + + static constexpr int kMaxCaptures = (1 << 15) - 1; + + static constexpr int kNoBacktrackLimit = 0; + + private: + js::RegExpShared* inner() const { + return value().toGCThing()->as<js::RegExpShared>(); + } +}; + +using RegExpFlags = JS::RegExpFlags; + +inline bool IsUnicode(RegExpFlags flags) { return flags.unicode(); } +inline bool IsGlobal(RegExpFlags flags) { return flags.global(); } +inline bool IsIgnoreCase(RegExpFlags flags) { return flags.ignoreCase(); } +inline bool IsMultiline(RegExpFlags flags) { return flags.multiline(); } +inline bool IsDotAll(RegExpFlags flags) { return flags.dotAll(); } +inline bool IsSticky(RegExpFlags flags) { return flags.sticky(); } + +// TODO: Support /v flag (bug 1713657) +inline bool IsUnicodeSets(RegExpFlags flags) { return false; } +inline bool IsEitherUnicode(RegExpFlags flags) { return flags.unicode(); } + +class Histogram { + public: + inline void AddSample(int sample) {} +}; + +class Counters { + public: + Histogram* regexp_backtracks() { return ®exp_backtracks_; } + + private: + Histogram regexp_backtracks_; +}; + +enum class AllocationType : uint8_t { + kYoung, // Allocate in the nursery + kOld, // Allocate in the tenured heap +}; + +using StackGuard = Isolate; +using Factory = Isolate; + +class Isolate { + public: + Isolate(JSContext* cx) : cx_(cx) {} + ~Isolate(); + bool init(); 
+ + size_t sizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf) const; + + //********** Isolate code **********// + RegExpStack* regexp_stack() const { return regexpStack_; } + + // This is called from inside no-GC code. Instead of suppressing GC + // to allocate the error, we return false from Execute and call + // ReportOverRecursed in the caller. + void StackOverflow() {} + +#ifndef V8_INTL_SUPPORT + unibrow::Mapping<unibrow::Ecma262UnCanonicalize>* jsregexp_uncanonicalize() { + return &jsregexp_uncanonicalize_; + } + unibrow::Mapping<unibrow::Ecma262Canonicalize>* + regexp_macro_assembler_canonicalize() { + return ®exp_macro_assembler_canonicalize_; + } + unibrow::Mapping<unibrow::CanonicalizationRange>* jsregexp_canonrange() { + return &jsregexp_canonrange_; + } + + private: + unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize_; + unibrow::Mapping<unibrow::Ecma262Canonicalize> + regexp_macro_assembler_canonicalize_; + unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange_; +#endif // !V8_INTL_SUPPORT + + public: + // An empty stub for telemetry we don't support + void IncreaseTotalRegexpCodeGenerated(Handle<HeapObject> code) {} + + Counters* counters() { return &counters_; } + + //********** Factory code **********// + inline Factory* factory() { return this; } + + Handle<ByteArray> NewByteArray( + int length, AllocationType allocation = AllocationType::kYoung); + + // Allocates a fixed array initialized with undefined values. + Handle<FixedArray> NewFixedArray(int length); + + template <typename T> + Handle<FixedIntegerArray<T>> NewFixedIntegerArray(uint32_t length); + + template <typename Char> + Handle<String> InternalizeString(const base::Vector<const Char>& str); + + //********** Stack guard code **********// + inline StackGuard* stack_guard() { return this; } + + uintptr_t real_climit() { return cx_->stackLimit(JS::StackForSystemCode); } + + // This is called from inside no-GC code. 
V8 runs the interrupt + // inside the no-GC code and then "manually relocates unhandlified + // references" afterwards. We just return false and let the caller + // handle interrupts. + Object HandleInterrupts() { return Object(JS::BooleanValue(false)); } + + JSContext* cx() const { return cx_; } + + void trace(JSTracer* trc); + + //********** Handle code **********// + + JS::Value* getHandleLocation(const JS::Value& value); + + private: + mozilla::SegmentedVector<JS::Value, 256> handleArena_; + mozilla::SegmentedVector<PseudoHandle<void>, 256> uniquePtrArena_; + + void* allocatePseudoHandle(size_t bytes); + + public: + template <typename T> + PseudoHandle<T> takeOwnership(void* ptr); + template <typename T> + PseudoHandle<T> maybeTakeOwnership(void* ptr); + + uint32_t liveHandles() const { return handleArena_.Length(); } + uint32_t livePseudoHandles() const { return uniquePtrArena_.Length(); } + + private: + void openHandleScope(HandleScope& scope) { + scope.level_ = handleArena_.Length(); + scope.non_gc_level_ = uniquePtrArena_.Length(); + } + void closeHandleScope(size_t prevLevel, size_t prevUniqueLevel) { + size_t currLevel = handleArena_.Length(); + handleArena_.PopLastN(currLevel - prevLevel); + + size_t currUniqueLevel = uniquePtrArena_.Length(); + uniquePtrArena_.PopLastN(currUniqueLevel - prevUniqueLevel); + } + friend class HandleScope; + + JSContext* cx_; + RegExpStack* regexpStack_{}; + Counters counters_{}; +#ifdef DEBUG + public: + uint32_t shouldSimulateInterrupt_ = 0; +#endif +}; + +// Origin: +// https://github.com/v8/v8/blob/50dcf2af54ce27801a71c47c1be1d2c5e36b0dd6/src/execution/isolate.h#L1909-L1931 +class StackLimitCheck { + public: + StackLimitCheck(Isolate* isolate) : cx_(isolate->cx()) {} + + // Use this to check for stack-overflows in C++ code. 
+ bool HasOverflowed() { + js::AutoCheckRecursionLimit recursion(cx_); + bool overflowed = !recursion.checkDontReport(cx_); + if (overflowed && js::SupportDifferentialTesting()) { + // We don't report overrecursion here, but we throw an exception later + // and this still affects differential testing. Mimic ReportOverRecursed + // (the fuzzers check for this particular string). + fprintf(stderr, "ReportOverRecursed called\n"); + } + return overflowed; + } + + // Use this to check for interrupt request in C++ code. + bool InterruptRequested() { + return cx_->hasPendingInterrupt(js::InterruptReason::CallbackUrgent); + } + + // Use this to check for stack-overflow when entering runtime from JS code. + bool JsHasOverflowed() { + js::AutoCheckRecursionLimit recursion(cx_); + return !recursion.checkDontReport(cx_); + } + + private: + JSContext* cx_; +}; + +class ExternalReference { + public: + static const void* TopOfRegexpStack(Isolate* isolate); + static size_t SizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf, + RegExpStack* regexpStack); +}; + +class Code : public HeapObject { + public: + uint8_t* raw_instruction_start() { return inner()->raw(); } + + static Code cast(Object object) { + Code c; + js::gc::Cell* jitCode = object.value().toGCThing(); + MOZ_ASSERT(jitCode->is<js::jit::JitCode>()); + c.setValue(JS::PrivateGCThingValue(jitCode)); + return c; + } + js::jit::JitCode* inner() { + return value().toGCThing()->as<js::jit::JitCode>(); + } +}; + +// Only used in function signature of functions we don't implement +// (NativeRegExpMacroAssembler::CheckStackGuardState) +class InstructionStream {}; + +// Origin: https://github.com/v8/v8/blob/master/src/codegen/label.h +class Label { + public: + Label() : inner_(js::jit::Label()) {} + + js::jit::Label* inner() { return &inner_; } + + void Unuse() { inner_.reset(); } + + bool is_linked() { return inner_.used(); } + bool is_bound() { return inner_.bound(); } + bool is_unused() { return !inner_.used() && 
!inner_.bound(); } + + int pos() { return inner_.offset(); } + void link_to(int pos) { inner_.use(pos); } + void bind_to(int pos) { inner_.bind(pos); } + + private: + js::jit::Label inner_; + js::jit::CodeOffset patchOffset_; + + friend class SMRegExpMacroAssembler; +}; + +#define v8_flags js::jit::JitOptions + +#define V8_USE_COMPUTED_GOTO 1 +#define COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + +} // namespace internal +} // namespace v8 + +namespace V8 { + +inline void FatalProcessOutOfMemory(v8::internal::Isolate* isolate, + const char* msg) { + js::AutoEnterOOMUnsafeRegion oomUnsafe; + oomUnsafe.crash(msg); +} + +} // namespace V8 + +#endif // RegexpShim_h |