// Copyright 2019 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_REGEXP_REGEXP_COMPILER_H_ #define V8_REGEXP_REGEXP_COMPILER_H_ #include #include "irregexp/imported/regexp-nodes.h" namespace v8 { namespace internal { class DynamicBitSet; class Isolate; namespace regexp_compiler_constants { // The '2' variant is has inclusive from and exclusive to. // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, // which include WhiteSpace (7.2) or LineTerminator (7.3) values. constexpr uc32 kRangeEndMarker = 0x110000; constexpr int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; constexpr int kSpaceRangeCount = arraysize(kSpaceRanges); constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; constexpr int kWordRangeCount = arraysize(kWordRanges); constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; constexpr int kDigitRangeCount = arraysize(kDigitRanges); constexpr int kSurrogateRanges[] = {kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker}; constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); // More makes code generation slower, less makes V8 benchmark score lower. constexpr int kMaxLookaheadForBoyerMoore = 8; // In a 3-character pattern you can maximally step forwards 3 characters // at a time, which is not always enough to pay for the extra logic. constexpr int kPatternTooShortForBoyerMoore = 2; } // namespace regexp_compiler_constants inline bool IgnoreCase(JSRegExp::Flags flags) { return (flags & JSRegExp::kIgnoreCase) != 0; } inline bool IsUnicode(JSRegExp::Flags flags) { return (flags & JSRegExp::kUnicode) != 0; } inline bool IsSticky(JSRegExp::Flags flags) { return (flags & JSRegExp::kSticky) != 0; } inline bool IsGlobal(JSRegExp::Flags flags) { return (flags & JSRegExp::kGlobal) != 0; } inline bool DotAll(JSRegExp::Flags flags) { return (flags & JSRegExp::kDotAll) != 0; } inline bool Multiline(JSRegExp::Flags flags) { return (flags & JSRegExp::kMultiline) != 0; } inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { // Both unicode and ignore_case flags are set. We need to use ICU to find // the closure over case equivalents. return IsUnicode(flags) && IgnoreCase(flags); } // Details of a quick mask-compare check that can look ahead in the // input stream. class QuickCheckDetails { public: QuickCheckDetails() : characters_(0), mask_(0), value_(0), cannot_match_(false) {} explicit QuickCheckDetails(int characters) : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} bool Rationalize(bool one_byte); // Merge in the information from another branch of an alternation. void Merge(QuickCheckDetails* other, int from_index); // Advance the current position by some amount. void Advance(int by, bool one_byte); void Clear(); bool cannot_match() { return cannot_match_; } void set_cannot_match() { cannot_match_ = true; } struct Position { Position() : mask(0), value(0), determines_perfectly(false) {} uc16 mask; uc16 value; bool determines_perfectly; }; int characters() { return characters_; } void set_characters(int characters) { characters_ = characters; } Position* positions(int index) { DCHECK_LE(0, index); DCHECK_GT(characters_, index); return positions_ + index; } uint32_t mask() { return mask_; } uint32_t value() { return value_; } private: // How many characters do we have quick check information from. This is // the same for all branches of a choice node. int characters_; Position positions_[4]; // These values are the condensate of the above array after Rationalize(). uint32_t mask_; uint32_t value_; // If set to true, there is no way this quick check can match at all. // E.g., if it requires to be at the start of the input, and isn't. bool cannot_match_; }; // Improve the speed that we scan for an initial point where a non-anchored // regexp can match by using a Boyer-Moore-like table. This is done by // identifying non-greedy non-capturing loops in the nodes that eat any // character one at a time. For example in the middle of the regexp // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly // inserted at the start of any non-anchored regexp. // // When we have found such a loop we look ahead in the nodes to find the set of // characters that can come at given distances. For example for the regexp // /.?foo/ we know that there are at least 3 characters ahead of us, and the // sets of characters that can occur are [any, [f, o], [o]]. We find a range in // the lookahead info where the set of characters is reasonably constrained. In // our example this is from index 1 to 2 (0 is not constrained). We can now // look 3 characters ahead and if we don't find one of [f, o] (the union of // [f, o] and [o]) then we can skip forwards by the range size (in this case 2). // // For Unicode input strings we do the same, but modulo 128. // // We also look at the first string fed to the regexp and use that to get a hint // of the character frequencies in the inputs. This affects the assessment of // whether the set of characters is 'reasonably constrained'. // // We also have another lookahead mechanism (called quick check in the code), // which uses a wide load of multiple characters followed by a mask and compare // to determine whether a match is possible at this point. enum ContainedInLattice { kNotYet = 0, kLatticeIn = 1, kLatticeOut = 2, kLatticeUnknown = 3 // Can also mean both in and out. }; inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { return static_cast(a | b); } class BoyerMoorePositionInfo : public ZoneObject { public: bool at(int i) const { return map_[i]; } static constexpr int kMapSize = 128; static constexpr int kMask = kMapSize - 1; int map_count() const { return map_count_; } void Set(int character); void SetInterval(const Interval& interval); void SetAll(); bool is_non_word() { return w_ == kLatticeOut; } bool is_word() { return w_ == kLatticeIn; } using Bitset = std::bitset; Bitset raw_bitset() const { return map_; } private: Bitset map_; int map_count_ = 0; // Number of set bits in the map. ContainedInLattice w_ = kNotYet; // The \w character class. }; class BoyerMooreLookahead : public ZoneObject { public: BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); int length() { return length_; } int max_char() { return max_char_; } RegExpCompiler* compiler() { return compiler_; } int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } void Set(int map_number, int character) { if (character > max_char_) return; BoyerMoorePositionInfo* info = bitmaps_->at(map_number); info->Set(character); } void SetInterval(int map_number, const Interval& interval) { if (interval.from() > max_char_) return; BoyerMoorePositionInfo* info = bitmaps_->at(map_number); if (interval.to() > max_char_) { info->SetInterval(Interval(interval.from(), max_char_)); } else { info->SetInterval(interval); } } void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } void SetRest(int from_map) { for (int i = from_map; i < length_; i++) SetAll(i); } void EmitSkipInstructions(RegExpMacroAssembler* masm); private: // This is the value obtained by EatsAtLeast. If we do not have at least this // many characters left in the sample string then the match is bound to fail. // Therefore it is OK to read a character this far ahead of the current match // point. int length_; RegExpCompiler* compiler_; // 0xff for Latin1, 0xffff for UTF-16. int max_char_; ZoneList* bitmaps_; int GetSkipTable(int min_lookahead, int max_lookahead, Handle boolean_skip_table); bool FindWorthwhileInterval(int* from, int* to); int FindBestInterval(int max_number_of_chars, int old_biggest_points, int* from, int* to); }; // There are many ways to generate code for a node. This class encapsulates // the current way we should be generating. In other words it encapsulates // the current state of the code generator. The effect of this is that we // generate code for paths that the matcher can take through the regular // expression. A given node in the regexp can be code-generated several times // as it can be part of several traces. For example for the regexp: // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code // to match foo is generated only once (the traces have a common prefix). The // code to store the capture is deferred and generated (twice) after the places // where baz has been matched. class Trace { public: // A value for a property that is either known to be true, know to be false, // or not known. enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; class DeferredAction { public: DeferredAction(ActionNode::ActionType action_type, int reg) : action_type_(action_type), reg_(reg), next_(nullptr) {} DeferredAction* next() { return next_; } bool Mentions(int reg); int reg() { return reg_; } ActionNode::ActionType action_type() { return action_type_; } private: ActionNode::ActionType action_type_; int reg_; DeferredAction* next_; friend class Trace; }; class DeferredCapture : public DeferredAction { public: DeferredCapture(int reg, bool is_capture, Trace* trace) : DeferredAction(ActionNode::STORE_POSITION, reg), cp_offset_(trace->cp_offset()), is_capture_(is_capture) {} int cp_offset() { return cp_offset_; } bool is_capture() { return is_capture_; } private: int cp_offset_; bool is_capture_; void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } }; class DeferredSetRegisterForLoop : public DeferredAction { public: DeferredSetRegisterForLoop(int reg, int value) : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg), value_(value) {} int value() { return value_; } private: int value_; }; class DeferredClearCaptures : public DeferredAction { public: explicit DeferredClearCaptures(Interval range) : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {} Interval range() { return range_; } private: Interval range_; }; class DeferredIncrementRegister : public DeferredAction { public: explicit DeferredIncrementRegister(int reg) : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {} }; Trace() : cp_offset_(0), actions_(nullptr), backtrack_(nullptr), stop_node_(nullptr), loop_label_(nullptr), characters_preloaded_(0), bound_checked_up_to_(0), flush_budget_(100), at_start_(UNKNOWN) {} // End the trace. This involves flushing the deferred actions in the trace // and pushing a backtrack location onto the backtrack stack. Once this is // done we can start a new trace or go to one that has already been // generated. void Flush(RegExpCompiler* compiler, RegExpNode* successor); int cp_offset() { return cp_offset_; } DeferredAction* actions() { return actions_; } // A trivial trace is one that has no deferred actions or other state that // affects the assumptions used when generating code. There is no recorded // backtrack location in a trivial trace, so with a trivial trace we will // generate code that, on a failure to match, gets the backtrack location // from the backtrack stack rather than using a direct jump instruction. We // always start code generation with a trivial trace and non-trivial traces // are created as we emit code for nodes or add to the list of deferred // actions in the trace. The location of the code generated for a node using // a trivial trace is recorded in a label in the node so that gotos can be // generated to that code. bool is_trivial() { return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; } TriBool at_start() { return at_start_; } void set_at_start(TriBool at_start) { at_start_ = at_start; } Label* backtrack() { return backtrack_; } Label* loop_label() { return loop_label_; } RegExpNode* stop_node() { return stop_node_; } int characters_preloaded() { return characters_preloaded_; } int bound_checked_up_to() { return bound_checked_up_to_; } int flush_budget() { return flush_budget_; } QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } bool mentions_reg(int reg); // Returns true if a deferred position store exists to the specified // register and stores the offset in the out-parameter. Otherwise // returns false. bool GetStoredPosition(int reg, int* cp_offset); // These set methods and AdvanceCurrentPositionInTrace should be used only on // new traces - the intention is that traces are immutable after creation. void add_action(DeferredAction* new_action) { DCHECK(new_action->next_ == nullptr); new_action->next_ = actions_; actions_ = new_action; } void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } void set_stop_node(RegExpNode* node) { stop_node_ = node; } void set_loop_label(Label* label) { loop_label_ = label; } void set_characters_preloaded(int count) { characters_preloaded_ = count; } void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } void set_flush_budget(int to) { flush_budget_ = to; } void set_quick_check_performed(QuickCheckDetails* d) { quick_check_performed_ = *d; } void InvalidateCurrentCharacter(); void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); private: int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, const DynamicBitSet& affected_registers, DynamicBitSet* registers_to_pop, DynamicBitSet* registers_to_clear, Zone* zone); void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, const DynamicBitSet& registers_to_pop, const DynamicBitSet& registers_to_clear); int cp_offset_; DeferredAction* actions_; Label* backtrack_; RegExpNode* stop_node_; Label* loop_label_; int characters_preloaded_; int bound_checked_up_to_; QuickCheckDetails quick_check_performed_; int flush_budget_; TriBool at_start_; }; class GreedyLoopState { public: explicit GreedyLoopState(bool not_at_start); Label* label() { return &label_; } Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } private: Label label_; Trace counter_backtrack_trace_; }; struct PreloadState { static const int kEatsAtLeastNotYetInitialized = -1; bool preload_is_current_; bool preload_has_checked_bounds_; int preload_characters_; int eats_at_least_; void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } }; // Analysis performs assertion propagation and computes eats_at_least_ values. // See the comments on AssertionPropagator and EatsAtLeastPropagator for more // details. RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); class FrequencyCollator { public: FrequencyCollator() : total_samples_(0) { for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { frequencies_[i] = CharacterFrequency(i); } } void CountCharacter(int character) { int index = (character & RegExpMacroAssembler::kTableMask); frequencies_[index].Increment(); total_samples_++; } // Does not measure in percent, but rather per-128 (the table size from the // regexp macro assembler). int Frequency(int in_character) { DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); if (total_samples_ < 1) return 1; // Division by zero. int freq_in_per128 = (frequencies_[in_character].counter() * 128) / total_samples_; return freq_in_per128; } private: class CharacterFrequency { public: CharacterFrequency() : counter_(0), character_(-1) {} explicit CharacterFrequency(int character) : counter_(0), character_(character) {} void Increment() { counter_++; } int counter() { return counter_; } int character() { return character_; } private: int counter_; int character_; }; private: CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; int total_samples_; }; class RegExpCompiler { public: RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, bool is_one_byte); int AllocateRegister() { if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { reg_exp_too_big_ = true; return next_register_; } return next_register_++; } // Lookarounds to match lone surrogates for unicode character class matches // are never nested. We can therefore reuse registers. int UnicodeLookaroundStackRegister() { if (unicode_lookaround_stack_register_ == kNoRegister) { unicode_lookaround_stack_register_ = AllocateRegister(); } return unicode_lookaround_stack_register_; } int UnicodeLookaroundPositionRegister() { if (unicode_lookaround_position_register_ == kNoRegister) { unicode_lookaround_position_register_ = AllocateRegister(); } return unicode_lookaround_position_register_; } struct CompilationResult final { explicit CompilationResult(RegExpError err) : error(err) {} CompilationResult(Handle code, int registers) : code(code), num_registers(registers) {} static CompilationResult RegExpTooBig() { return CompilationResult(RegExpError::kTooLarge); } bool Succeeded() const { return error == RegExpError::kNone; } const RegExpError error = RegExpError::kNone; Handle code; int num_registers = 0; }; CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, RegExpNode* start, int capture_count, Handle pattern); // Preprocessing is the final step of node creation before analysis // and assembly. It includes: // - Wrapping the body of the regexp in capture 0. // - Inserting the implicit .* before/after the regexp if necessary. // - If the input is a one-byte string, filtering out nodes that can't match. // - Fixing up regexp matches that start within a surrogate pair. RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags, bool is_one_byte); // If the regexp matching starts within a surrogate pair, step back to the // lead surrogate and start matching from there. RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success, JSRegExp::Flags flags); inline void AddWork(RegExpNode* node) { if (!node->on_work_list() && !node->label()->is_bound()) { node->set_on_work_list(true); work_list_->push_back(node); } } static const int kImplementationOffset = 0; static const int kNumberOfRegistersOffset = 0; static const int kCodeOffset = 1; RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } EndNode* accept() { return accept_; } static const int kMaxRecursion = 100; inline int recursion_depth() { return recursion_depth_; } inline void IncrementRecursionDepth() { recursion_depth_++; } inline void DecrementRecursionDepth() { recursion_depth_--; } void SetRegExpTooBig() { reg_exp_too_big_ = true; } inline bool one_byte() { return one_byte_; } inline bool optimize() { return optimize_; } inline void set_optimize(bool value) { optimize_ = value; } inline bool limiting_recursion() { return limiting_recursion_; } inline void set_limiting_recursion(bool value) { limiting_recursion_ = value; } bool read_backward() { return read_backward_; } void set_read_backward(bool value) { read_backward_ = value; } FrequencyCollator* frequency_collator() { return &frequency_collator_; } int current_expansion_factor() { return current_expansion_factor_; } void set_current_expansion_factor(int value) { current_expansion_factor_ = value; } Isolate* isolate() const { return isolate_; } Zone* zone() const { return zone_; } static const int kNoRegister = -1; private: EndNode* accept_; int next_register_; int unicode_lookaround_stack_register_; int unicode_lookaround_position_register_; ZoneVector* work_list_; int recursion_depth_; RegExpMacroAssembler* macro_assembler_; bool one_byte_; bool reg_exp_too_big_; bool limiting_recursion_; bool optimize_; bool read_backward_; int current_expansion_factor_; FrequencyCollator frequency_collator_; Isolate* isolate_; Zone* zone_; }; // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. class UnicodeRangeSplitter { public: V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList* base); static constexpr int kInitialSize = 8; using CharacterRangeVector = base::SmallVector; const CharacterRangeVector* bmp() const { return &bmp_; } const CharacterRangeVector* lead_surrogates() const { return &lead_surrogates_; } const CharacterRangeVector* trail_surrogates() const { return &trail_surrogates_; } const CharacterRangeVector* non_bmp() const { return &non_bmp_; } private: void AddRange(CharacterRange range); CharacterRangeVector bmp_; CharacterRangeVector lead_surrogates_; CharacterRangeVector trail_surrogates_; CharacterRangeVector non_bmp_; }; // We need to check for the following characters: 0x39C 0x3BC 0x178. // TODO(jgruber): Move to CharacterRange. bool RangeContainsLatin1Equivalents(CharacterRange range); } // namespace internal } // namespace v8 #endif // V8_REGEXP_REGEXP_COMPILER_H_