diff options
Diffstat (limited to 'xpcom/string/RustRegex.h')
-rw-r--r-- | xpcom/string/RustRegex.h | 707 |
1 files changed, 707 insertions, 0 deletions
diff --git a/xpcom/string/RustRegex.h b/xpcom/string/RustRegex.h new file mode 100644 index 0000000000..80b8140bb5 --- /dev/null +++ b/xpcom/string/RustRegex.h @@ -0,0 +1,707 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef mozilla_RustRegex_h +#define mozilla_RustRegex_h + +#include "nsPrintfCString.h" +#include "nsTArray.h" +#include "rure.h" +#include "mozilla/Maybe.h" +#include "mozilla/UniquePtr.h" + +namespace mozilla { + +// This header is a thin wrapper around the `rure.h` header file, which declares +// the C API for interacting with the rust `regex` crate. This is intended to +// make the type more ergonomic to use with mozilla types. + +class RustRegex; +class RustRegexSet; +class RustRegexOptions; +class RustRegexCaptures; +class RustRegexIter; +class RustRegexIterCaptureNames; + +using RustRegexMatch = rure_match; + +/* + * RustRegexCaptures represents storage for sub-capture locations of a match. + * + * Computing the capture groups of a match can carry a significant performance + * penalty, so their use in the API is optional. + * + * A RustRegexCaptures value may outlive its corresponding RustRegex and can be + * freed independently. + * + * It is not safe to use from multiple threads simultaneously. + */ +class RustRegexCaptures final { + public: + RustRegexCaptures() = default; + + // Check if the `RustRegexCaptures` object is valid. + bool IsValid() const { return mPtr != nullptr; } + explicit operator bool() const { return IsValid(); } + + /* + * CaptureAt returns Some if and only if the capturing group at the + * index given was part of the match. If so, the returned RustRegexMatch + * object contains the start and end offsets (in bytes) of the match. + * + * If no capture group with the index aIdx exists, or the group was not part + * of the match, then Nothing is returned. (A capturing group exists if and + * only if aIdx is less than Length().) + * + * Note that index 0 corresponds to the full match. + */ + Maybe<RustRegexMatch> CaptureAt(size_t aIdx) const { + RustRegexMatch match; + if (mPtr && rure_captures_at(mPtr.get(), aIdx, &match)) { + return Some(match); + } + return Nothing(); + } + Maybe<RustRegexMatch> operator[](size_t aIdx) const { + return CaptureAt(aIdx); + } + + /* + * Returns the number of capturing groups in this `RustRegexCaptures`. + */ + size_t Length() const { return mPtr ? rure_captures_len(mPtr.get()) : 0; } + + private: + friend class RustRegex; + friend class RustRegexIter; + + explicit RustRegexCaptures(rure* aRe) + : mPtr(aRe ? rure_captures_new(aRe) : nullptr) {} + + struct Deleter { + void operator()(rure_captures* ptr) const { rure_captures_free(ptr); } + }; + UniquePtr<rure_captures, Deleter> mPtr; +}; + +/* + * RustRegexIterCaptureNames is an iterator over the list of capture group names + * in this particular RustRegex. + * + * A RustRegexIterCaptureNames value may not outlive its corresponding + * RustRegex, and should be destroyed before its corresponding RustRegex is + * destroyed. + * + * It is not safe to use from multiple threads simultaneously. + */ +class RustRegexIterCaptureNames { + public: + RustRegexIterCaptureNames() = delete; + + // Check if the `RustRegexIterCaptureNames` object is valid. + bool IsValid() const { return mPtr != nullptr; } + explicit operator bool() const { return IsValid(); } + + /* + * Advances the iterator and returns true if and only if another capture group + * name exists. + * + * The value of the capture group name is written to the provided pointer. + */ + mozilla::Maybe<const char*> Next() { + char* next = nullptr; + if (mPtr && rure_iter_capture_names_next(mPtr.get(), &next)) { + return Some(next); + } + return Nothing(); + } + + private: + friend class RustRegex; + + explicit RustRegexIterCaptureNames(rure* aRe) + : mPtr(aRe ? rure_iter_capture_names_new(aRe) : nullptr) {} + + struct Deleter { + void operator()(rure_iter_capture_names* ptr) const { + rure_iter_capture_names_free(ptr); + } + }; + UniquePtr<rure_iter_capture_names, Deleter> mPtr; +}; + +/* + * RustRegexIter is an iterator over successive non-overlapping matches in a + * particular haystack. + * + * A RustRegexIter value may not outlive its corresponding RustRegex and should + * be destroyed before its corresponding RustRegex is destroyed. + * + * It is not safe to use from multiple threads simultaneously. + */ +class RustRegexIter { + public: + RustRegexIter() = delete; + + // Check if the `RustRegexIter` object is valid. + bool IsValid() const { return mPtr != nullptr; } + explicit operator bool() const { return IsValid(); } + + /* + * Next() returns Some if and only if this regex matches anywhere in haystack. + * The returned RustRegexMatch object contains the start and end offsets (in + * bytes) of the match. + * + * If no match is found, then subsequent calls will return Nothing() + * indefinitely. + * + * Next() should be preferred to NextCaptures() since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + mozilla::Maybe<RustRegexMatch> Next() { + RustRegexMatch match{}; + if (mPtr && + rure_iter_next(mPtr.get(), mHaystackPtr, mHaystackSize, &match)) { + return Some(match); + } + return Nothing(); + } + + /* + * NextCaptures returns a valid RustRegexCaptures if and only if this regex + * matches anywhere in haystack. If a match is found, then all of its capture + * locations are stored in the returned RustRegexCaptures object. + * + * If no match is found, then subsequent calls will return an invalid + * `RustRegexCaptures` indefinitely. + * + * Only use this function if you specifically need access to capture + * locations. It is not necessary to use this function just because your + * regular expression contains capturing groups. + * + * Capture locations can be accessed using the methods on RustRegexCaptures. + * + * N.B. The performance of this search can be impacted by the number of + * capturing groups. If you're using this function, it may be beneficial to + * use non-capturing groups (e.g., `(?:re)`) where possible. + */ + RustRegexCaptures NextCaptures() { + RustRegexCaptures captures(mRe); + if (mPtr && rure_iter_next_captures(mPtr.get(), mHaystackPtr, mHaystackSize, + captures.mPtr.get())) { + return captures; + } + return {}; + } + + private: + friend class RustRegex; + RustRegexIter(rure* aRe, const std::string_view& aHaystack) + : mRe(aRe), + mHaystackPtr(reinterpret_cast<const uint8_t*>(aHaystack.data())), + mHaystackSize(aHaystack.size()), + mPtr(aRe ? rure_iter_new(aRe) : nullptr) {} + + rure* MOZ_NON_OWNING_REF mRe; + const uint8_t* MOZ_NON_OWNING_REF mHaystackPtr; + size_t mHaystackSize; + + struct Deleter { + void operator()(rure_iter* ptr) const { rure_iter_free(ptr); } + }; + UniquePtr<rure_iter, Deleter> mPtr; +}; + +/* + * RustRegexOptions is the set of configuration options for compiling a regular + * expression. + * + * All flags on this type can be used to set default flags while compiling, and + * can be toggled in the expression itself using standard syntax, e.g. `(?i)` + * turns case-insensitive matching on, and `(?-i)` disables it. + * + * In addition, two non-flag options are available: setting the size limit of + * the compiled program and setting the size limit of the cache of states that + * the DFA uses while searching. + * + * For most uses, the default settings will work fine, and a default-constructed + * RustRegexOptions can be passed. + */ +class RustRegexOptions { + public: + RustRegexOptions() = default; + + /* + * Set the value for the case insensitive (i) flag. + * + * When enabled, letters in the pattern will match both upper case and lower + * case variants. + */ + RustRegexOptions& CaseInsensitive(bool aYes) { + return SetFlag(aYes, RURE_FLAG_CASEI); + } + + /* + * Set the value for the multi-line matching (m) flag. + * + * When enabled, ^ matches the beginning of lines and $ matches the end of + * lines. + * + * By default, they match beginning/end of the input. + */ + RustRegexOptions& MultiLine(bool aYes) { + return SetFlag(aYes, RURE_FLAG_MULTI); + } + + /* + * Set the value for the any character (s) flag, where in . matches anything + * when s is set and matches anything except for new line when it is not set + * (the default). + * + * N.B. “matches anything” means “any byte” when Unicode is disabled and means + * “any valid UTF-8 encoding of any Unicode scalar value” when Unicode is + * enabled. + */ + RustRegexOptions& DotMatchesNewLine(bool aYes) { + return SetFlag(aYes, RURE_FLAG_DOTNL); + } + + /* + * Set the value for the greedy swap (U) flag. + * + * When enabled, a pattern like a* is lazy (tries to find shortest match) and + * a*? is greedy (tries to find longest match). + * + * By default, a* is greedy and a*? is lazy. + */ + RustRegexOptions& SwapGreed(bool aYes) { + return SetFlag(aYes, RURE_FLAG_SWAP_GREED); + } + + /* + * Set the value for the ignore whitespace (x) flag. + * + * When enabled, whitespace such as new lines and spaces will be ignored + * between expressions of the pattern, and # can be used to start a comment + * until the next new line. + */ + RustRegexOptions& IgnoreWhitespace(bool aYes) { + return SetFlag(aYes, RURE_FLAG_SPACE); + } + + /* + * Set the value for the Unicode (u) flag. + * + * Enabled by default. When disabled, character classes such as \w only match + * ASCII word characters instead of all Unicode word characters. + */ + RustRegexOptions& Unicode(bool aYes) { + return SetFlag(aYes, RURE_FLAG_UNICODE); + } + + /* + * SizeLimit sets the appoximate size limit of the compiled regular + * expression. + * + * This size limit roughly corresponds to the number of bytes occupied by + * a single compiled program. If the program would exceed this number, + * then an invalid RustRegex will be constructed. + */ + RustRegexOptions& SizeLimit(size_t aLimit) { + mSizeLimit = Some(aLimit); + return *this; + } + + /* + * DFASizeLimit sets the approximate size of the cache used by the DFA during + * search. + * + * This roughly corresponds to the number of bytes that the DFA will use while + * searching. + * + * Note that this is a *per thread* limit. There is no way to set a global + * limit. In particular, if a regular expression is used from multiple threads + * simultaneously, then each thread may use up to the number of bytes + * specified here. + */ + RustRegexOptions& DFASizeLimit(size_t aLimit) { + mDFASizeLimit = Some(aLimit); + return *this; + } + + private: + friend class RustRegex; + friend class RustRegexSet; + + struct OptionsDeleter { + void operator()(rure_options* ptr) const { rure_options_free(ptr); } + }; + + UniquePtr<rure_options, OptionsDeleter> GetOptions() const { + UniquePtr<rure_options, OptionsDeleter> options; + if (mSizeLimit || mDFASizeLimit) { + options.reset(rure_options_new()); + if (mSizeLimit) { + rure_options_size_limit(options.get(), *mSizeLimit); + } + if (mDFASizeLimit) { + rure_options_dfa_size_limit(options.get(), *mDFASizeLimit); + } + } + return options; + } + + uint32_t GetFlags() const { return mFlags; } + + RustRegexOptions& SetFlag(bool aYes, uint32_t aFlag) { + if (aYes) { + mFlags |= aFlag; + } else { + mFlags &= ~aFlag; + } + return *this; + } + + uint32_t mFlags = RURE_DEFAULT_FLAGS; + Maybe<size_t> mSizeLimit; + Maybe<size_t> mDFASizeLimit; +}; + +/* + * RustRegex is the type of a compiled regular expression. + * + * A RustRegex can be safely used from multiple threads simultaneously. + * + * When calling the matching methods on this type, they will generally have the + * following parameters: + * + * aHaystack + * may contain arbitrary bytes, but ASCII compatible text is more useful. + * UTF-8 is even more useful. Other text encodings aren't supported. + * + * aStart + * the position in bytes at which to start searching. Note that setting the + * start position is distinct from using a substring for `aHaystack`, since + * the regex engine may look at bytes before the start position to determine + * match information. For example, if the start position is greater than 0, + * then the \A ("begin text") anchor can never match. + */ +class RustRegex final { + public: + // Create a new invalid RustRegex object + RustRegex() = default; + + /* + * Compiles the given pattern into a regular expression. The pattern must be + * valid UTF-8 and the length corresponds to the number of bytes in the + * pattern. + * + * If an error occurs, the constructed RustRegex will be `!IsValid()`. + * + * The compiled expression returned may be used from multiple threads + * simultaneously. + */ + explicit RustRegex(const std::string_view& aPattern, + const RustRegexOptions& aOptions = {}) { +#ifdef DEBUG + rure_error* error = rure_error_new(); +#else + rure_error* error = nullptr; +#endif + mPtr.reset(rure_compile(reinterpret_cast<const uint8_t*>(aPattern.data()), + aPattern.size(), aOptions.GetFlags(), + aOptions.GetOptions().get(), error)); +#ifdef DEBUG + if (!mPtr) { + NS_WARNING(nsPrintfCString("RustRegex compile failed: %s", + rure_error_message(error)) + .get()); + } + rure_error_free(error); +#endif + } + + // Check if the compiled `RustRegex` is valid. + bool IsValid() const { return mPtr != nullptr; } + explicit operator bool() const { return IsValid(); } + + /* + * IsMatch returns true if and only if this regex matches anywhere in + * aHaystack. + * + * See the type-level comment for details on aHaystack and aStart. + * + * IsMatch() should be preferred to Find() since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + bool IsMatch(const std::string_view& aHaystack, size_t aStart = 0) const { + return mPtr && + rure_is_match(mPtr.get(), + reinterpret_cast<const uint8_t*>(aHaystack.data()), + aHaystack.size(), aStart); + } + + /* + * Find returns Some if and only if this regex matches anywhere in + * haystack. The returned RustRegexMatch object contains the start and end + * offsets (in bytes) of the match. + * + * See the type-level comment for details on aHaystack and aStart. + * + * Find() should be preferred to FindCaptures() since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + Maybe<RustRegexMatch> Find(const std::string_view& aHaystack, + size_t aStart = 0) const { + RustRegexMatch match{}; + if (mPtr && rure_find(mPtr.get(), + reinterpret_cast<const uint8_t*>(aHaystack.data()), + aHaystack.size(), aStart, &match)) { + return Some(match); + } + return Nothing(); + } + + /* + * FindCaptures() returns a valid RustRegexCaptures if and only if this + * regex matches anywhere in haystack. If a match is found, then all of its + * capture locations are stored in the returned RustRegexCaptures object. + * + * See the type-level comment for details on aHaystack and aStart. + * + * Only use this function if you specifically need access to capture + * locations. It is not necessary to use this function just because your + * regular expression contains capturing groups. + * + * Capture locations can be accessed using the methods on RustRegexCaptures. + * + * N.B. The performance of this search can be impacted by the number of + * capturing groups. If you're using this function, it may be beneficial to + * use non-capturing groups (e.g., `(?:re)`) where possible. + */ + RustRegexCaptures FindCaptures(const std::string_view& aHaystack, + size_t aStart = 0) const { + RustRegexCaptures captures(mPtr.get()); + if (mPtr && + rure_find_captures(mPtr.get(), + reinterpret_cast<const uint8_t*>(aHaystack.data()), + aHaystack.size(), aStart, captures.mPtr.get())) { + return captures; + } + return {}; + } + + /* + * ShortestMatch() returns Some if and only if this regex matches anywhere + * in haystack. If a match is found, then its end location is stored in the + * pointer given. The end location is the place at which the regex engine + * determined that a match exists, but may occur before the end of the + * proper leftmost-first match. + * + * See the type-level comment for details on aHaystack and aStart. + * + * ShortestMatch should be preferred to Find since it may be faster. + * + * N.B. The performance of this search is not impacted by the presence of + * capturing groups in your regular expression. + */ + Maybe<size_t> ShortestMatch(const std::string_view& aHaystack, + size_t aStart = 0) const { + size_t end = 0; + if (mPtr && + rure_shortest_match(mPtr.get(), + reinterpret_cast<const uint8_t*>(aHaystack.data()), + aHaystack.size(), aStart, &end)) { + return Some(end); + } + return Nothing(); + } + + /* + * Create an iterator over all successive non-overlapping matches of this + * regex in aHaystack. + * + * See the type-level comment for details on aHaystack. + * + * Both aHaystack and this regex must remain valid until the returned + * `RustRegexIter` is destroyed. + */ + RustRegexIter IterMatches(const std::string_view& aHaystack) const { + return RustRegexIter(mPtr.get(), aHaystack); + } + + /* + * Returns the capture index for the name given. If no such named capturing + * group exists in this regex, then -1 is returned. + * + * The capture index may be used with RustRegexCaptures::CaptureAt. + * + * This function never returns 0 since the first capture group always + * corresponds to the entire match and is always unnamed. + */ + int32_t CaptureNameIndex(const char* aName) const { + return mPtr ? rure_capture_name_index(mPtr.get(), aName) : -1; + } + + /* + * Create an iterator over the list of capture group names in this particular + * regex. + * + * This regex must remain valid until the returned `RustRegexIterCaptureNames` + * is destroyed. + */ + RustRegexIterCaptureNames IterCaptureNames() const { + return RustRegexIterCaptureNames(mPtr.get()); + } + + /* + * Count the number of successive non-overlapping matches of this regex in + * aHaystack. + * + * See the type-level comment for details on aHaystack. + */ + size_t CountMatches(const std::string_view& aHaystack) const { + size_t count = 0; + auto iter = IterMatches(aHaystack); + while (iter.Next()) { + count++; + } + return count; + } + + private: + struct Deleter { + void operator()(rure* ptr) const { rure_free(ptr); } + }; + UniquePtr<rure, Deleter> mPtr; +}; + +/* + * RustRegexSet is the type of a set of compiled regular expression. + * + * A RustRegexSet can be safely used from multiple threads simultaneously. + * + * When calling the matching methods on this type, they will generally have the + * following parameters: + * + * aHaystack + * may contain arbitrary bytes, but ASCII compatible text is more useful. + * UTF-8 is even more useful. Other text encodings aren't supported. + * + * aStart + * the position in bytes at which to start searching. Note that setting the + * start position is distinct from using a substring for `aHaystack`, since + * the regex engine may look at bytes before the start position to determine + * match information. For example, if the start position is greater than 0, + * then the \A ("begin text") anchor can never match. + */ +class RustRegexSet final { + public: + /* + * Compiles the given range of patterns into a single regular expression which + * can be matched in a linear-scan. Each pattern in aPatterns must be valid + * UTF-8, and implicitly coerce to `std::string_view`. + * + * If an error occurs, the constructed RustRegexSet will be `!IsValid()`. + * + * The compiled expression returned may be used from multiple threads + * simultaneously. + */ + template <typename Patterns> + explicit RustRegexSet(Patterns&& aPatterns, + const RustRegexOptions& aOptions = {}) { +#ifdef DEBUG + rure_error* error = rure_error_new(); +#else + rure_error* error = nullptr; +#endif + AutoTArray<const uint8_t*, 4> patternPtrs; + AutoTArray<size_t, 4> patternSizes; + for (auto&& pattern : std::forward<Patterns>(aPatterns)) { + std::string_view view = pattern; + patternPtrs.AppendElement(reinterpret_cast<const uint8_t*>(view.data())); + patternSizes.AppendElement(view.size()); + } + mPtr.reset(rure_compile_set(patternPtrs.Elements(), patternSizes.Elements(), + patternPtrs.Length(), aOptions.GetFlags(), + aOptions.GetOptions().get(), error)); +#ifdef DEBUG + if (!mPtr) { + NS_WARNING(nsPrintfCString("RustRegexSet compile failed: %s", + rure_error_message(error)) + .get()); + } + rure_error_free(error); +#endif + } + + // Check if the `RustRegexSet` object is valid. + bool IsValid() const { return mPtr != nullptr; } + explicit operator bool() const { return IsValid(); } + + /* + * IsMatch returns true if and only if any regexes within the set + * match anywhere in the haystack. Once a match has been located, the + * matching engine will quit immediately. + * + * See the type-level comment for details on aHaystack and aStart. + */ + bool IsMatch(const std::string_view& aHaystack, size_t aStart = 0) const { + return mPtr && + rure_set_is_match(mPtr.get(), + reinterpret_cast<const uint8_t*>(aHaystack.data()), + aHaystack.size(), aStart); + } + + struct SetMatches { + bool matchedAny = false; + nsTArray<bool> matches; + }; + + /* + * Matches() compares each regex in the set against the haystack and + * returns a list with the match result of each pattern. Match results are + * ordered in the same way as the regex set was compiled. For example, index 0 + * of matches corresponds to the first pattern passed to the constructor. + * + * See the type-level comment for details on aHaystack and aStart. + * + * Only use this function if you specifically need to know which regexes + * matched within the set. To determine if any of the regexes matched without + * caring which, use IsMatch. + */ + SetMatches Matches(const std::string_view& aHaystack, + size_t aStart = 0) const { + nsTArray<bool> matches; + matches.SetLength(Length()); + bool any = mPtr && rure_set_matches( + mPtr.get(), + reinterpret_cast<const uint8_t*>(aHaystack.data()), + aHaystack.size(), aStart, matches.Elements()); + return SetMatches{any, std::move(matches)}; + } + + /* + * Returns the number of patterns the regex set was compiled with. + */ + size_t Length() const { return mPtr ? rure_set_len(mPtr.get()) : 0; } + + private: + struct Deleter { + void operator()(rure_set* ptr) const { rure_set_free(ptr); } + }; + UniquePtr<rure_set, Deleter> mPtr; +}; + +} // namespace mozilla + +#endif // mozilla_RustRegex_h |