/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifndef __nsCharSeparatedTokenizer_h #define __nsCharSeparatedTokenizer_h #include "mozilla/Maybe.h" #include "mozilla/RangedPtr.h" #include "mozilla/TypedEnumBits.h" #include "nsCRTGlue.h" #include "nsTDependentSubstring.h" // Flags -- only one for now. If we need more, they should be defined to // be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.) enum class nsTokenizerFlags { Default = 0, SeparatorOptional = 1 << 0, IncludeEmptyTokenAtEnd = 1 << 1 }; MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(nsTokenizerFlags) /** * This parses a SeparatorChar-separated string into tokens. * Whitespace surrounding tokens is not treated as part of tokens, however * whitespace inside a token is. If the final token is the empty string, it is * not returned by default. * * Some examples, with SeparatorChar = ',': * * "foo, bar, baz" -> "foo" "bar" "baz" * "foo,bar,baz" -> "foo" "bar" "baz" * "foo , bar hi , baz" -> "foo" "bar hi" "baz" * "foo, ,bar,baz" -> "foo" "" "bar" "baz" * "foo,,bar,baz" -> "foo" "" "bar" "baz" * "foo,bar,baz," -> "foo" "bar" "baz" * * The function used for whitespace detection is a template argument. * By default, it is NS_IsAsciiWhitespace. */ template class nsTCharSeparatedTokenizer { using CharType = typename TDependentSubstringType::char_type; using SubstringType = typename TDependentSubstringType::substring_type; public: using DependentSubstringType = TDependentSubstringType; nsTCharSeparatedTokenizer(const SubstringType& aSource, CharType aSeparatorChar) : mIter(aSource.Data(), aSource.Length()), mEnd(aSource.Data() + aSource.Length(), aSource.Data(), aSource.Length()), mSeparatorChar(aSeparatorChar), mWhitespaceBeforeFirstToken(false), mWhitespaceAfterCurrentToken(false), mSeparatorAfterCurrentToken(false) { // Skip initial whitespace while (mIter < mEnd && IsWhitespace(*mIter)) { mWhitespaceBeforeFirstToken = true; ++mIter; } } /** * Checks if any more tokens are available. */ bool hasMoreTokens() const { MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), "Should be at beginning of token if there is one"); if constexpr (Flags & nsTokenizerFlags::IncludeEmptyTokenAtEnd) { return mIter < mEnd || (mIter == mEnd && mSeparatorAfterCurrentToken); } else { return mIter < mEnd; } } /* * Returns true if there is whitespace prior to the first token. */ bool whitespaceBeforeFirstToken() const { return mWhitespaceBeforeFirstToken; } /* * Returns true if there is a separator after the current token. * Useful if you want to check whether the last token has a separator * after it which may not be valid. */ bool separatorAfterCurrentToken() const { return mSeparatorAfterCurrentToken; } /* * Returns true if there is any whitespace after the current token. */ bool whitespaceAfterCurrentToken() const { return mWhitespaceAfterCurrentToken; } /** * Returns the next token. */ const DependentSubstringType nextToken() { mozilla::RangedPtr tokenStart = mIter; mozilla::RangedPtr tokenEnd = mIter; MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), "Should be at beginning of token if there is one"); // Search until we hit separator or end (or whitespace, if a separator // isn't required -- see clause with 'break' below). while (mIter < mEnd && *mIter != mSeparatorChar) { // Skip to end of the current word. while (mIter < mEnd && !IsWhitespace(*mIter) && *mIter != mSeparatorChar) { ++mIter; } tokenEnd = mIter; // Skip whitespace after the current word. mWhitespaceAfterCurrentToken = false; while (mIter < mEnd && IsWhitespace(*mIter)) { mWhitespaceAfterCurrentToken = true; ++mIter; } if constexpr (Flags & nsTokenizerFlags::SeparatorOptional) { // We've hit (and skipped) whitespace, and that's sufficient to end // our token, regardless of whether we've reached a SeparatorChar. break; } // (else, we'll keep looping until we hit mEnd or SeparatorChar) } mSeparatorAfterCurrentToken = (mIter != mEnd && *mIter == mSeparatorChar); MOZ_ASSERT((Flags & nsTokenizerFlags::SeparatorOptional) || (mSeparatorAfterCurrentToken == (mIter < mEnd)), "If we require a separator and haven't hit the end of " "our string, then we shouldn't have left the loop " "unless we hit a separator"); // Skip separator (and any whitespace after it), if we're at one. if (mSeparatorAfterCurrentToken) { ++mIter; while (mIter < mEnd && IsWhitespace(*mIter)) { mWhitespaceAfterCurrentToken = true; ++mIter; } } return Substring(tokenStart.get(), tokenEnd.get()); } auto ToRange() const; private: mozilla::RangedPtr mIter; const mozilla::RangedPtr mEnd; const CharType mSeparatorChar; bool mWhitespaceBeforeFirstToken; bool mWhitespaceAfterCurrentToken; bool mSeparatorAfterCurrentToken; }; constexpr bool NS_TokenizerIgnoreNothing(char16_t) { return false; } template using nsTCharSeparatedTokenizerTemplate = nsTCharSeparatedTokenizer, IsWhitespace, Flags>; template using nsCharSeparatedTokenizerTemplate = nsTCharSeparatedTokenizerTemplate; using nsCharSeparatedTokenizer = nsCharSeparatedTokenizerTemplate; template using nsCCharSeparatedTokenizerTemplate = nsTCharSeparatedTokenizerTemplate; using nsCCharSeparatedTokenizer = nsCCharSeparatedTokenizerTemplate; /** * Adapts a char separated tokenizer for use in a range-based for loop. * * Use this typically only indirectly, e.g. like * * for (const auto& token : nsCharSeparatedTokenizer(aText, ' ').ToRange()) { * // ... * } */ template class nsTokenizedRange { public: using DependentSubstringType = typename Tokenizer::DependentSubstringType; explicit nsTokenizedRange(Tokenizer&& aTokenizer) : mTokenizer(std::move(aTokenizer)) {} struct EndSentinel {}; struct Iterator { explicit Iterator(const Tokenizer& aTokenizer) : mTokenizer(aTokenizer) { Next(); } const DependentSubstringType& operator*() const { return *mCurrentToken; } Iterator& operator++() { Next(); return *this; } bool operator==(const EndSentinel&) const { return mCurrentToken.isNothing(); } bool operator!=(const EndSentinel&) const { return mCurrentToken.isSome(); } private: void Next() { mCurrentToken.reset(); if (mTokenizer.hasMoreTokens()) { mCurrentToken.emplace(mTokenizer.nextToken()); } } Tokenizer mTokenizer; mozilla::Maybe mCurrentToken; }; auto begin() const { return Iterator{mTokenizer}; } auto end() const { return EndSentinel{}; } private: const Tokenizer mTokenizer; }; template auto nsTCharSeparatedTokenizer::ToRange() const { return nsTokenizedRange{nsTCharSeparatedTokenizer{*this}}; } // You should not need to instantiate this class directly. // Use nsTSubstring::Split instead. template class nsTSubstringSplitter : public nsTokenizedRange> { public: using nsTokenizedRange>::nsTokenizedRange; }; extern template class nsTSubstringSplitter; extern template class nsTSubstringSplitter; #endif /* __nsCharSeparatedTokenizer_h */