diff options
Diffstat (limited to 'xpcom/ds/Tokenizer.cpp')
-rw-r--r-- | xpcom/ds/Tokenizer.cpp | 805 |
1 files changed, 805 insertions, 0 deletions
diff --git a/xpcom/ds/Tokenizer.cpp b/xpcom/ds/Tokenizer.cpp new file mode 100644 index 0000000000..11bdf81033 --- /dev/null +++ b/xpcom/ds/Tokenizer.cpp @@ -0,0 +1,805 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "Tokenizer.h" + +#include "nsUnicharUtils.h" +#include <algorithm> + +namespace mozilla { + +template <> +char const TokenizerBase<char>::sWhitespaces[] = {' ', '\t', 0}; +template <> +char16_t const TokenizerBase<char16_t>::sWhitespaces[3] = {' ', '\t', 0}; + +template <typename TChar> +static bool contains(TChar const* const list, TChar const needle) { + for (TChar const* c = list; *c; ++c) { + if (needle == *c) { + return true; + } + } + return false; +} + +template <typename TChar> +TTokenizer<TChar>::TTokenizer(const typename base::TAString& aSource, + const TChar* aWhitespaces, + const TChar* aAdditionalWordChars) + : TokenizerBase<TChar>(aWhitespaces, aAdditionalWordChars) { + base::mInputFinished = true; + aSource.BeginReading(base::mCursor); + mRecord = mRollback = base::mCursor; + aSource.EndReading(base::mEnd); +} + +template <typename TChar> +TTokenizer<TChar>::TTokenizer(const TChar* aSource, const TChar* aWhitespaces, + const TChar* aAdditionalWordChars) + : TTokenizer(typename base::TDependentString(aSource), aWhitespaces, + aAdditionalWordChars) {} + +template <typename TChar> +bool TTokenizer<TChar>::Next(typename base::Token& aToken) { + if (!base::HasInput()) { + base::mHasFailed = true; + return false; + } + + mRollback = base::mCursor; + base::mCursor = base::Parse(aToken); + + base::AssignFragment(aToken, mRollback, base::mCursor); + + base::mPastEof = aToken.Type() == base::TOKEN_EOF; + base::mHasFailed = false; + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::Check(const typename base::TokenType aTokenType, + typename base::Token& aResult) { + if (!base::HasInput()) { + base::mHasFailed = true; + return false; + } + + typename base::TAString::const_char_iterator next = base::Parse(aResult); + if (aTokenType != aResult.Type()) { + base::mHasFailed = true; + return false; + } + + mRollback = base::mCursor; + base::mCursor = next; + + base::AssignFragment(aResult, mRollback, base::mCursor); + + base::mPastEof = aResult.Type() == base::TOKEN_EOF; + base::mHasFailed = false; + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::Check(const typename base::Token& aToken) { +#ifdef DEBUG + base::Validate(aToken); +#endif + + if (!base::HasInput()) { + base::mHasFailed = true; + return false; + } + + typename base::Token parsed; + typename base::TAString::const_char_iterator next = base::Parse(parsed); + if (!aToken.Equals(parsed)) { + base::mHasFailed = true; + return false; + } + + mRollback = base::mCursor; + base::mCursor = next; + base::mPastEof = parsed.Type() == base::TOKEN_EOF; + base::mHasFailed = false; + return true; +} + +template <typename TChar> +void TTokenizer<TChar>::SkipWhites(WhiteSkipping aIncludeNewLines) { + if (!CheckWhite() && + (aIncludeNewLines == DONT_INCLUDE_NEW_LINE || !CheckEOL())) { + return; + } + + typename base::TAString::const_char_iterator rollback = mRollback; + while (CheckWhite() || (aIncludeNewLines == INCLUDE_NEW_LINE && CheckEOL())) { + } + + base::mHasFailed = false; + mRollback = rollback; +} + +template <typename TChar> +void TTokenizer<TChar>::SkipUntil(typename base::Token const& aToken) { + typename base::TAString::const_char_iterator rollback = base::mCursor; + const typename base::Token eof = base::Token::EndOfFile(); + + typename base::Token t; + while (Next(t)) { + if (aToken.Equals(t) || eof.Equals(t)) { + Rollback(); + break; + } + } + + mRollback = rollback; +} + +template <typename TChar> +bool TTokenizer<TChar>::CheckChar(bool (*aClassifier)(const TChar aChar)) { + if (!aClassifier) { + MOZ_ASSERT(false); + return false; + } + + if (!base::HasInput() || base::mCursor == base::mEnd) { + base::mHasFailed = true; + return false; + } + + if (!aClassifier(*base::mCursor)) { + base::mHasFailed = true; + return false; + } + + mRollback = base::mCursor; + ++base::mCursor; + base::mHasFailed = false; + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::CheckPhrase(const typename base::TAString& aPhrase) { + if (!base::HasInput()) { + return false; + } + + typedef typename base::TAString::const_char_iterator Cursor; + + TTokenizer<TChar> pattern(aPhrase); + MOZ_ASSERT(!pattern.CheckEOF(), + "This will return true but won't shift the Tokenizer's cursor"); + + return [&](Cursor cursor, Cursor rollback) mutable { + while (true) { + if (pattern.CheckEOF()) { + base::mHasFailed = false; + mRollback = cursor; + return true; + } + + typename base::Token t1, t2; + Unused << Next(t1); + Unused << pattern.Next(t2); + if (t1.Type() == t2.Type() && t1.Fragment().Equals(t2.Fragment())) { + continue; + } + + break; + } + + base::mHasFailed = true; + base::mPastEof = false; + base::mCursor = cursor; + mRollback = rollback; + return false; + }(base::mCursor, mRollback); +} + +template <typename TChar> +bool TTokenizer<TChar>::ReadChar(TChar* aValue) { + MOZ_RELEASE_ASSERT(aValue); + + typename base::Token t; + if (!Check(base::TOKEN_CHAR, t)) { + return false; + } + + *aValue = t.AsChar(); + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::ReadChar(bool (*aClassifier)(const TChar aChar), + TChar* aValue) { + MOZ_RELEASE_ASSERT(aValue); + + if (!CheckChar(aClassifier)) { + return false; + } + + *aValue = *mRollback; + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::ReadWord(typename base::TAString& aValue) { + typename base::Token t; + if (!Check(base::TOKEN_WORD, t)) { + return false; + } + + aValue.Assign(t.AsString()); + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::ReadWord(typename base::TDependentSubstring& aValue) { + typename base::Token t; + if (!Check(base::TOKEN_WORD, t)) { + return false; + } + + aValue.Rebind(t.AsString().BeginReading(), t.AsString().Length()); + return true; +} + +template <typename TChar> +bool TTokenizer<TChar>::ReadUntil(typename base::Token const& aToken, + typename base::TAString& aResult, + ClaimInclusion aInclude) { + typename base::TDependentSubstring substring; + bool rv = ReadUntil(aToken, substring, aInclude); + aResult.Assign(substring); + return rv; +} + +template <typename TChar> +bool TTokenizer<TChar>::ReadUntil(typename base::Token const& aToken, + typename base::TDependentSubstring& aResult, + ClaimInclusion aInclude) { + typename base::TAString::const_char_iterator record = mRecord; + Record(); + typename base::TAString::const_char_iterator rollback = mRollback = + base::mCursor; + + bool found = false; + typename base::Token t; + while (Next(t)) { + if (aToken.Equals(t)) { + found = true; + break; + } + if (t.Equals(base::Token::EndOfFile())) { + // We don't want to eat it. + Rollback(); + break; + } + } + + Claim(aResult, aInclude); + mRollback = rollback; + mRecord = record; + return found; +} + +template <typename TChar> +void TTokenizer<TChar>::Rollback() { + MOZ_ASSERT(base::mCursor > mRollback || base::mPastEof, "TODO!!!"); + + base::mPastEof = false; + base::mHasFailed = false; + base::mCursor = mRollback; +} + +template <typename TChar> +void TTokenizer<TChar>::Record(ClaimInclusion aInclude) { + mRecord = aInclude == INCLUDE_LAST ? mRollback : base::mCursor; +} + +template <typename TChar> +void TTokenizer<TChar>::Claim(typename base::TAString& aResult, + ClaimInclusion aInclusion) { + typename base::TAString::const_char_iterator close = + aInclusion == EXCLUDE_LAST ? mRollback : base::mCursor; + aResult.Assign(Substring(mRecord, close)); +} + +template <typename TChar> +void TTokenizer<TChar>::Claim(typename base::TDependentSubstring& aResult, + ClaimInclusion aInclusion) { + typename base::TAString::const_char_iterator close = + aInclusion == EXCLUDE_LAST ? mRollback : base::mCursor; + + MOZ_RELEASE_ASSERT(close >= mRecord, "Overflow!"); + aResult.Rebind(mRecord, close - mRecord); +} + +// TokenizerBase + +template <typename TChar> +TokenizerBase<TChar>::TokenizerBase(const TChar* aWhitespaces, + const TChar* aAdditionalWordChars) + : mPastEof(false), + mHasFailed(false), + mInputFinished(true), + mMode(Mode::FULL), + mMinRawDelivery(1024), + mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces), + mAdditionalWordChars(aAdditionalWordChars), + mCursor(nullptr), + mEnd(nullptr), + mNextCustomTokenID(TOKEN_CUSTOM0) {} + +template <typename TChar> +auto TokenizerBase<TChar>::AddCustomToken(const TAString& aValue, + ECaseSensitivity aCaseInsensitivity, + bool aEnabled) -> Token { + MOZ_ASSERT(!aValue.IsEmpty()); + + UniquePtr<Token>& t = *mCustomTokens.AppendElement(); + t = MakeUnique<Token>(); + + t->mType = static_cast<TokenType>(++mNextCustomTokenID); + t->mCustomCaseInsensitivity = aCaseInsensitivity; + t->mCustomEnabled = aEnabled; + t->mCustom.Assign(aValue); + return *t; +} + +template <typename TChar> +void TokenizerBase<TChar>::RemoveCustomToken(Token& aToken) { + if (aToken.mType == TOKEN_UNKNOWN) { + // Already removed + return; + } + + for (UniquePtr<Token> const& custom : mCustomTokens) { + if (custom->mType == aToken.mType) { + mCustomTokens.RemoveElement(custom); + aToken.mType = TOKEN_UNKNOWN; + return; + } + } + + MOZ_ASSERT(false, "Token to remove not found"); +} + +template <typename TChar> +void TokenizerBase<TChar>::EnableCustomToken(Token const& aToken, + bool aEnabled) { + if (aToken.mType == TOKEN_UNKNOWN) { + // Already removed + return; + } + + for (UniquePtr<Token> const& custom : mCustomTokens) { + if (custom->Type() == aToken.Type()) { + // This effectively destroys the token instance. + custom->mCustomEnabled = aEnabled; + return; + } + } + + MOZ_ASSERT(false, "Token to change not found"); +} + +template <typename TChar> +void TokenizerBase<TChar>::SetTokenizingMode(Mode aMode) { + mMode = aMode; +} + +template <typename TChar> +bool TokenizerBase<TChar>::HasFailed() const { + return mHasFailed; +} + +template <typename TChar> +bool TokenizerBase<TChar>::HasInput() const { + return !mPastEof; +} + +template <typename TChar> +auto TokenizerBase<TChar>::Parse(Token& aToken) const -> + typename TAString::const_char_iterator { + if (mCursor == mEnd) { + if (!mInputFinished) { + return mCursor; + } + + aToken = Token::EndOfFile(); + return mEnd; + } + + MOZ_RELEASE_ASSERT(mEnd >= mCursor, "Overflow!"); + typename TAString::size_type available = mEnd - mCursor; + + uint32_t longestCustom = 0; + for (UniquePtr<Token> const& custom : mCustomTokens) { + if (IsCustom(mCursor, *custom, &longestCustom)) { + aToken = *custom; + return mCursor + custom->mCustom.Length(); + } + } + + if (!mInputFinished && available < longestCustom) { + // Not enough data to deterministically decide. + return mCursor; + } + + typename TAString::const_char_iterator next = mCursor; + + if (mMode == Mode::CUSTOM_ONLY) { + // We have to do a brute-force search for all of the enabled custom + // tokens. + while (next < mEnd) { + ++next; + for (UniquePtr<Token> const& custom : mCustomTokens) { + if (IsCustom(next, *custom)) { + aToken = Token::Raw(); + return next; + } + } + } + + if (mInputFinished) { + // End of the data reached. + aToken = Token::Raw(); + return next; + } + + if (longestCustom < available && available > mMinRawDelivery) { + // We can return some data w/o waiting for either a custom token + // or call to FinishData() when we leave the tail where all the + // custom tokens potentially fit, so we can't lose only partially + // delivered tokens. This preserves reasonable granularity. + aToken = Token::Raw(); + return mEnd - longestCustom + 1; + } + + // Not enough data to deterministically decide. + return mCursor; + } + + enum State { + PARSE_INTEGER, + PARSE_WORD, + PARSE_CRLF, + PARSE_LF, + PARSE_WS, + PARSE_CHAR, + } state; + + if (IsWordFirst(*next)) { + state = PARSE_WORD; + } else if (IsNumber(*next)) { + state = PARSE_INTEGER; + } else if (contains(mWhitespaces, *next)) { // not UTF-8 friendly? + state = PARSE_WS; + } else if (*next == '\r') { + state = PARSE_CRLF; + } else if (*next == '\n') { + state = PARSE_LF; + } else { + state = PARSE_CHAR; + } + + mozilla::CheckedUint64 resultingNumber = 0; + + while (next < mEnd) { + switch (state) { + case PARSE_INTEGER: + // Keep it simple for now + resultingNumber *= 10; + resultingNumber += static_cast<uint64_t>(*next - '0'); + + ++next; + if (IsPending(next)) { + break; + } + if (IsEnd(next) || !IsNumber(*next)) { + if (!resultingNumber.isValid()) { + aToken = Token::Error(); + } else { + aToken = Token::Number(resultingNumber.value()); + } + return next; + } + break; + + case PARSE_WORD: + ++next; + if (IsPending(next)) { + break; + } + if (IsEnd(next) || !IsWord(*next)) { + aToken = Token::Word(Substring(mCursor, next)); + return next; + } + break; + + case PARSE_CRLF: + ++next; + if (IsPending(next)) { + break; + } + if (!IsEnd(next) && *next == '\n') { // LF is optional + ++next; + } + aToken = Token::NewLine(); + return next; + + case PARSE_LF: + ++next; + aToken = Token::NewLine(); + return next; + + case PARSE_WS: + ++next; + aToken = Token::Whitespace(); + return next; + + case PARSE_CHAR: + ++next; + aToken = Token::Char(*mCursor); + return next; + } // switch (state) + } // while (next < end) + + MOZ_ASSERT(!mInputFinished); + return mCursor; +} + +template <typename TChar> +bool TokenizerBase<TChar>::IsEnd( + const typename TAString::const_char_iterator& caret) const { + return caret == mEnd; +} + +template <typename TChar> +bool TokenizerBase<TChar>::IsPending( + const typename TAString::const_char_iterator& caret) const { + return IsEnd(caret) && !mInputFinished; +} + +template <typename TChar> +bool TokenizerBase<TChar>::IsWordFirst(const TChar aInput) const { + // TODO: make this fully work with unicode + return (ToLowerCase(static_cast<uint32_t>(aInput)) != + ToUpperCase(static_cast<uint32_t>(aInput))) || + '_' == aInput || + (mAdditionalWordChars ? contains(mAdditionalWordChars, aInput) + : false); +} + +template <typename TChar> +bool TokenizerBase<TChar>::IsWord(const TChar aInput) const { + return IsWordFirst(aInput) || IsNumber(aInput); +} + +template <typename TChar> +bool TokenizerBase<TChar>::IsNumber(const TChar aInput) const { + // TODO: are there unicode numbers? + return aInput >= '0' && aInput <= '9'; +} + +template <typename TChar> +bool TokenizerBase<TChar>::IsCustom( + const typename TAString::const_char_iterator& caret, + const Token& aCustomToken, uint32_t* aLongest) const { + MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0); + if (!aCustomToken.mCustomEnabled) { + return false; + } + + if (aLongest) { + *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length()); + } + + // This is not very likely to happen according to how we call this method + // and since it's on a hot path, it's just a diagnostic assert, + // not a release assert. + MOZ_DIAGNOSTIC_ASSERT(mEnd >= caret, "Overflow?"); + uint32_t inputLength = mEnd - caret; + if (aCustomToken.mCustom.Length() > inputLength) { + return false; + } + + TDependentSubstring inputFragment(caret, aCustomToken.mCustom.Length()); + if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) { + if constexpr (std::is_same_v<TChar, char>) { + return inputFragment.Equals(aCustomToken.mCustom, + nsCaseInsensitiveUTF8StringComparator); + } else { + return inputFragment.Equals(aCustomToken.mCustom, + nsCaseInsensitiveStringComparator); + } + } + return inputFragment.Equals(aCustomToken.mCustom); +} + +template <typename TChar> +void TokenizerBase<TChar>::AssignFragment( + Token& aToken, typename TAString::const_char_iterator begin, + typename TAString::const_char_iterator end) { + aToken.AssignFragment(begin, end); +} + +#ifdef DEBUG + +template <typename TChar> +void TokenizerBase<TChar>::Validate(Token const& aToken) { + if (aToken.Type() == TOKEN_WORD) { + typename TAString::const_char_iterator c = aToken.AsString().BeginReading(); + typename TAString::const_char_iterator e = aToken.AsString().EndReading(); + + if (c < e) { + MOZ_ASSERT(IsWordFirst(*c)); + while (++c < e) { + MOZ_ASSERT(IsWord(*c)); + } + } + } +} + +#endif + +// TokenizerBase::Token + +template <typename TChar> +TokenizerBase<TChar>::Token::Token() + : mType(TOKEN_UNKNOWN), + mChar(0), + mInteger(0), + mCustomCaseInsensitivity(CASE_SENSITIVE), + mCustomEnabled(false) {} + +template <typename TChar> +TokenizerBase<TChar>::Token::Token(const Token& aOther) + : mType(aOther.mType), + mCustom(aOther.mCustom), + mChar(aOther.mChar), + mInteger(aOther.mInteger), + mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity), + mCustomEnabled(aOther.mCustomEnabled) { + if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) { + mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length()); + } +} + +template <typename TChar> +auto TokenizerBase<TChar>::Token::operator=(const Token& aOther) -> Token& { + mType = aOther.mType; + mCustom = aOther.mCustom; + mChar = aOther.mChar; + mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length()); + mInteger = aOther.mInteger; + mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity; + mCustomEnabled = aOther.mCustomEnabled; + return *this; +} + +template <typename TChar> +void TokenizerBase<TChar>::Token::AssignFragment( + typename TAString::const_char_iterator begin, + typename TAString::const_char_iterator end) { + MOZ_RELEASE_ASSERT(end >= begin, "Overflow!"); + mFragment.Rebind(begin, end - begin); +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::Raw() -> Token { + Token t; + t.mType = TOKEN_RAW; + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::Word(TAString const& aValue) -> Token { + Token t; + t.mType = TOKEN_WORD; + t.mWord.Rebind(aValue.BeginReading(), aValue.Length()); + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::Char(TChar const aValue) -> Token { + Token t; + t.mType = TOKEN_CHAR; + t.mChar = aValue; + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::Number(uint64_t const aValue) -> Token { + Token t; + t.mType = TOKEN_INTEGER; + t.mInteger = aValue; + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::Whitespace() -> Token { + Token t; + t.mType = TOKEN_WS; + t.mChar = '\0'; + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::NewLine() -> Token { + Token t; + t.mType = TOKEN_EOL; + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::EndOfFile() -> Token { + Token t; + t.mType = TOKEN_EOF; + return t; +} + +// static +template <typename TChar> +auto TokenizerBase<TChar>::Token::Error() -> Token { + Token t; + t.mType = TOKEN_ERROR; + return t; +} + +template <typename TChar> +bool TokenizerBase<TChar>::Token::Equals(const Token& aOther) const { + if (mType != aOther.mType) { + return false; + } + + switch (mType) { + case TOKEN_INTEGER: + return AsInteger() == aOther.AsInteger(); + case TOKEN_WORD: + return AsString() == aOther.AsString(); + case TOKEN_CHAR: + return AsChar() == aOther.AsChar(); + default: + return true; + } +} + +template <typename TChar> +TChar TokenizerBase<TChar>::Token::AsChar() const { + MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS); + return mChar; +} + +template <typename TChar> +auto TokenizerBase<TChar>::Token::AsString() const -> TDependentSubstring { + MOZ_ASSERT(mType == TOKEN_WORD); + return mWord; +} + +template <typename TChar> +uint64_t TokenizerBase<TChar>::Token::AsInteger() const { + MOZ_ASSERT(mType == TOKEN_INTEGER); + return mInteger; +} + +template class TokenizerBase<char>; +template class TokenizerBase<char16_t>; + +template class TTokenizer<char>; +template class TTokenizer<char16_t>; + +} // namespace mozilla |