diff options
Diffstat (limited to 'xpcom/ds/nsCharSeparatedTokenizer.h')
-rw-r--r-- | xpcom/ds/nsCharSeparatedTokenizer.h | 274 |
1 files changed, 274 insertions, 0 deletions
diff --git a/xpcom/ds/nsCharSeparatedTokenizer.h b/xpcom/ds/nsCharSeparatedTokenizer.h new file mode 100644 index 0000000000..5cf6992e3e --- /dev/null +++ b/xpcom/ds/nsCharSeparatedTokenizer.h @@ -0,0 +1,274 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef __nsCharSeparatedTokenizer_h +#define __nsCharSeparatedTokenizer_h + +#include "mozilla/Maybe.h" +#include "mozilla/RangedPtr.h" +#include "mozilla/TypedEnumBits.h" + +#include "nsCRTGlue.h" +#include "nsTDependentSubstring.h" + +// Flags -- only one for now. If we need more, they should be defined to +// be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.) +enum class nsTokenizerFlags { + Default = 0, + SeparatorOptional = 1 << 0, + IncludeEmptyTokenAtEnd = 1 << 1 +}; + +MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(nsTokenizerFlags) + +/** + * This parses a SeparatorChar-separated string into tokens. + * Whitespace surrounding tokens is not treated as part of tokens, however + * whitespace inside a token is. If the final token is the empty string, it is + * not returned by default. + * + * Some examples, with SeparatorChar = ',': + * + * "foo, bar, baz" -> "foo" "bar" "baz" + * "foo,bar,baz" -> "foo" "bar" "baz" + * "foo , bar hi , baz" -> "foo" "bar hi" "baz" + * "foo, ,bar,baz" -> "foo" "" "bar" "baz" + * "foo,,bar,baz" -> "foo" "" "bar" "baz" + * "foo,bar,baz," -> "foo" "bar" "baz" + * + * The function used for whitespace detection is a template argument. + * By default, it is NS_IsAsciiWhitespace. + */ +template <typename TDependentSubstringType, bool IsWhitespace(char16_t), + nsTokenizerFlags Flags = nsTokenizerFlags::Default> +class nsTCharSeparatedTokenizer { + using CharType = typename TDependentSubstringType::char_type; + using SubstringType = typename TDependentSubstringType::substring_type; + + public: + using DependentSubstringType = TDependentSubstringType; + + nsTCharSeparatedTokenizer(const SubstringType& aSource, + CharType aSeparatorChar) + : mIter(aSource.Data(), aSource.Length()), + mEnd(aSource.Data() + aSource.Length(), aSource.Data(), + aSource.Length()), + mSeparatorChar(aSeparatorChar), + mWhitespaceBeforeFirstToken(false), + mWhitespaceAfterCurrentToken(false), + mSeparatorAfterCurrentToken(false) { + // Skip initial whitespace + while (mIter < mEnd && IsWhitespace(*mIter)) { + mWhitespaceBeforeFirstToken = true; + ++mIter; + } + } + + /** + * Checks if any more tokens are available. + */ + bool hasMoreTokens() const { + MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), + "Should be at beginning of token if there is one"); + + if constexpr (Flags & nsTokenizerFlags::IncludeEmptyTokenAtEnd) { + return mIter < mEnd || (mIter == mEnd && mSeparatorAfterCurrentToken); + } else { + return mIter < mEnd; + } + } + + /* + * Returns true if there is whitespace prior to the first token. + */ + bool whitespaceBeforeFirstToken() const { + return mWhitespaceBeforeFirstToken; + } + + /* + * Returns true if there is a separator after the current token. + * Useful if you want to check whether the last token has a separator + * after it which may not be valid. + */ + bool separatorAfterCurrentToken() const { + return mSeparatorAfterCurrentToken; + } + + /* + * Returns true if there is any whitespace after the current token. + */ + bool whitespaceAfterCurrentToken() const { + return mWhitespaceAfterCurrentToken; + } + + /** + * Returns the next token. + */ + const DependentSubstringType nextToken() { + mozilla::RangedPtr<const CharType> tokenStart = mIter; + mozilla::RangedPtr<const CharType> tokenEnd = mIter; + + MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), + "Should be at beginning of token if there is one"); + + // Search until we hit separator or end (or whitespace, if a separator + // isn't required -- see clause with 'break' below). + while (mIter < mEnd && *mIter != mSeparatorChar) { + // Skip to end of the current word. + while (mIter < mEnd && !IsWhitespace(*mIter) && + *mIter != mSeparatorChar) { + ++mIter; + } + tokenEnd = mIter; + + // Skip whitespace after the current word. + mWhitespaceAfterCurrentToken = false; + while (mIter < mEnd && IsWhitespace(*mIter)) { + mWhitespaceAfterCurrentToken = true; + ++mIter; + } + if constexpr (Flags & nsTokenizerFlags::SeparatorOptional) { + // We've hit (and skipped) whitespace, and that's sufficient to end + // our token, regardless of whether we've reached a SeparatorChar. + break; + } // (else, we'll keep looping until we hit mEnd or SeparatorChar) + } + + mSeparatorAfterCurrentToken = (mIter != mEnd && *mIter == mSeparatorChar); + MOZ_ASSERT((Flags & nsTokenizerFlags::SeparatorOptional) || + (mSeparatorAfterCurrentToken == (mIter < mEnd)), + "If we require a separator and haven't hit the end of " + "our string, then we shouldn't have left the loop " + "unless we hit a separator"); + + // Skip separator (and any whitespace after it), if we're at one. + if (mSeparatorAfterCurrentToken) { + ++mIter; + + while (mIter < mEnd && IsWhitespace(*mIter)) { + mWhitespaceAfterCurrentToken = true; + ++mIter; + } + } + + return Substring(tokenStart.get(), tokenEnd.get()); + } + + auto ToRange() const; + + private: + mozilla::RangedPtr<const CharType> mIter; + const mozilla::RangedPtr<const CharType> mEnd; + const CharType mSeparatorChar; + bool mWhitespaceBeforeFirstToken; + bool mWhitespaceAfterCurrentToken; + bool mSeparatorAfterCurrentToken; +}; + +constexpr bool NS_TokenizerIgnoreNothing(char16_t) { return false; } + +template <bool IsWhitespace(char16_t), typename CharType, + nsTokenizerFlags Flags = nsTokenizerFlags::Default> +using nsTCharSeparatedTokenizerTemplate = + nsTCharSeparatedTokenizer<nsTDependentSubstring<CharType>, IsWhitespace, + Flags>; + +template <bool IsWhitespace(char16_t), + nsTokenizerFlags Flags = nsTokenizerFlags::Default> +using nsCharSeparatedTokenizerTemplate = + nsTCharSeparatedTokenizerTemplate<IsWhitespace, char16_t, Flags>; + +using nsCharSeparatedTokenizer = + nsCharSeparatedTokenizerTemplate<NS_IsAsciiWhitespace>; + +template <bool IsWhitespace(char16_t), + nsTokenizerFlags Flags = nsTokenizerFlags::Default> +using nsCCharSeparatedTokenizerTemplate = + nsTCharSeparatedTokenizerTemplate<IsWhitespace, char, Flags>; + +using nsCCharSeparatedTokenizer = + nsCCharSeparatedTokenizerTemplate<NS_IsAsciiWhitespace>; + +/** + * Adapts a char separated tokenizer for use in a range-based for loop. + * + * Use this typically only indirectly, e.g. like + * + * for (const auto& token : nsCharSeparatedTokenizer(aText, ' ').ToRange()) { + * // ... + * } + */ +template <typename Tokenizer> +class nsTokenizedRange { + public: + using DependentSubstringType = typename Tokenizer::DependentSubstringType; + + explicit nsTokenizedRange(Tokenizer&& aTokenizer) + : mTokenizer(std::move(aTokenizer)) {} + + struct EndSentinel {}; + struct Iterator { + explicit Iterator(const Tokenizer& aTokenizer) : mTokenizer(aTokenizer) { + Next(); + } + + const DependentSubstringType& operator*() const { return *mCurrentToken; } + + Iterator& operator++() { + Next(); + return *this; + } + + bool operator==(const EndSentinel&) const { + return mCurrentToken.isNothing(); + } + + bool operator!=(const EndSentinel&) const { return mCurrentToken.isSome(); } + + private: + void Next() { + mCurrentToken.reset(); + + if (mTokenizer.hasMoreTokens()) { + mCurrentToken.emplace(mTokenizer.nextToken()); + } + } + + Tokenizer mTokenizer; + mozilla::Maybe<DependentSubstringType> mCurrentToken; + }; + + auto begin() const { return Iterator{mTokenizer}; } + auto end() const { return EndSentinel{}; } + + private: + const Tokenizer mTokenizer; +}; + +template <typename TDependentSubstringType, bool IsWhitespace(char16_t), + nsTokenizerFlags Flags> +auto nsTCharSeparatedTokenizer<TDependentSubstringType, IsWhitespace, + Flags>::ToRange() const { + return nsTokenizedRange{nsTCharSeparatedTokenizer{*this}}; +} + +// You should not need to instantiate this class directly. +// Use nsTSubstring::Split instead. +template <typename T> +class nsTSubstringSplitter + : public nsTokenizedRange<nsTCharSeparatedTokenizerTemplate< + NS_TokenizerIgnoreNothing, T, + nsTokenizerFlags::IncludeEmptyTokenAtEnd>> { + public: + using nsTokenizedRange<nsTCharSeparatedTokenizerTemplate< + NS_TokenizerIgnoreNothing, T, + nsTokenizerFlags::IncludeEmptyTokenAtEnd>>::nsTokenizedRange; +}; + +extern template class nsTSubstringSplitter<char>; +extern template class nsTSubstringSplitter<char16_t>; + +#endif /* __nsCharSeparatedTokenizer_h */ |