diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /extensions/spellcheck/src/mozInlineSpellWordUtil.cpp | |
parent | Initial commit. (diff) | |
download | firefox-e51783d008170d9ab27d25da98ca3a38b0a41b67.tar.xz firefox-e51783d008170d9ab27d25da98ca3a38b0a41b67.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'extensions/spellcheck/src/mozInlineSpellWordUtil.cpp')
-rw-r--r-- | extensions/spellcheck/src/mozInlineSpellWordUtil.cpp | 1174 |
1 files changed, 1174 insertions, 0 deletions
diff --git a/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp new file mode 100644 index 0000000000..17a661ade3 --- /dev/null +++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp @@ -0,0 +1,1174 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozInlineSpellWordUtil.h" + +#include <algorithm> +#include <utility> + +#include "mozilla/BinarySearch.h" +#include "mozilla/EditorBase.h" +#include "mozilla/HTMLEditor.h" +#include "mozilla/Logging.h" +#include "mozilla/dom/Element.h" + +#include "nsDebug.h" +#include "nsAtom.h" +#include "nsComponentManagerUtils.h" +#include "nsUnicodeProperties.h" +#include "nsServiceManagerUtils.h" +#include "nsIContent.h" +#include "nsTextFragment.h" +#include "nsRange.h" +#include "nsContentUtils.h" +#include "nsIFrame.h" + +using namespace mozilla; + +static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"}; + +// IsIgnorableCharacter +// +// These characters are ones that we should ignore in input. + +inline bool IsIgnorableCharacter(char ch) { + return (ch == static_cast<char>(0xAD)); // SOFT HYPHEN +} + +inline bool IsIgnorableCharacter(char16_t ch) { + return (ch == 0xAD || // SOFT HYPHEN + ch == 0x1806); // MONGOLIAN TODO SOFT HYPHEN +} + +// IsConditionalPunctuation +// +// Some characters (like apostrophes) require characters on each side to be +// part of a word, and are otherwise punctuation. 
/**
 * 8-bit overload: returns true when |ch| is "conditional punctuation", i.e. a
 * character that only counts as part of a word when it has word characters on
 * both sides (the surrounding-character check is done by the caller).
 *
 * @param ch a single byte of 8-bit text; 0xB7 is compared as a raw byte value.
 * @return true for APOSTROPHE or MIDDLE DOT, false otherwise.
 */
inline bool IsConditionalPunctuation(char ch) {
  return (ch == '\'' ||                    // APOSTROPHE
          ch == static_cast<char>(0xB7));  // MIDDLE DOT
}

/**
 * UTF-16 overload of IsConditionalPunctuation.
 *
 * @param ch a UTF-16 code unit.
 * @return true for APOSTROPHE, RIGHT SINGLE QUOTATION MARK or MIDDLE DOT.
 */
inline bool IsConditionalPunctuation(char16_t ch) {
  return (ch == '\'' ||    // APOSTROPHE
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}

/**
 * Returns true when |ch| is a character whose role as a separator cannot be
 * decided without looking at its neighbours: '@', ':', '.', '/', '-' and any
 * conditional punctuation.
 *
 * @param ch a UTF-16 code unit.
 */
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
          IsConditionalPunctuation(ch));
}

/**
 * 8-bit overload of IsAmbiguousDOMWordSeprator; forwards to the UTF-16
 * overload after widening |ch|.
 *
 * @param ch a single byte of 8-bit text.
 */
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  //
  // Widen through unsigned char first. Where plain char is signed, a direct
  // cast of a byte >= 0x80 sign-extends (0xB7 becomes 0xFFB7), so MIDDLE DOT
  // would never match in the UTF-16 overload. This mirrors the uint8_t
  // widening done by WordSplitState<nsDependentCSubstring>::GetUnicharAt.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<unsigned char>(ch)));
}

// IsDOMWordSeparator
//
//    Determines if the given character should be considered as a DOM Word
//    separator. Basically, this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is important.
//    For example, we can't have any punctuation that could appear in a URL
//    or email address in this, because those need to always fit into a single
//    DOM word.
+ +static bool IsDOMWordSeparator(char ch) { + // simple spaces or no-break space + return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || + ch == static_cast<char>(0xA0)); +} + +static bool IsDOMWordSeparator(char16_t ch) { + // simple spaces + if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true; + + // complex spaces - check only if char isn't ASCII (uncommon) + if (ch >= 0xA0 && (ch == 0x00A0 || // NO-BREAK SPACE + ch == 0x2002 || // EN SPACE + ch == 0x2003 || // EM SPACE + ch == 0x2009 || // THIN SPACE + ch == 0x3000)) // IDEOGRAPHIC SPACE + return true; + + // otherwise not a space + return false; +} + +bool NodeOffset::operator==( + const mozilla::RangeBoundary& aRangeBoundary) const { + if (aRangeBoundary.Container() != mNode) { + return false; + } + + const Maybe<uint32_t> rangeBoundaryOffset = + aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets); + + MOZ_ASSERT(mOffset >= 0); + return rangeBoundaryOffset && + (*rangeBoundaryOffset == static_cast<uint32_t>(mOffset)); +} + +bool NodeOffsetRange::operator==(const nsRange& aRange) const { + return mBegin == aRange.StartRef() && mEnd == aRange.EndRef(); +} + +// static +Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create( + const EditorBase& aEditorBase) { + dom::Document* document = aEditorBase.GetDocument(); + if (NS_WARN_IF(!document)) { + return Nothing(); + } + + const bool isContentEditableOrDesignMode = aEditorBase.IsHTMLEditor(); + + // Find the root node for the editor. For contenteditable the mRootNode could + // change to shadow root if the begin and end are inside the shadowDOM. 
+ nsINode* rootNode = aEditorBase.GetRoot(); + if (NS_WARN_IF(!rootNode)) { + return Nothing(); + } + + mozInlineSpellWordUtil util{*document, isContentEditableOrDesignMode, + *rootNode}; + return Some(std::move(util)); +} + +static inline bool IsSpellCheckingTextNode(nsINode* aNode) { + nsIContent* parent = aNode->GetParent(); + if (parent && + parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style)) + return false; + return aNode->IsText(); +} + +typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure); + +// Find the next node in the DOM tree in preorder. +// Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is +// why we can't just use GetNextNode here, sadly. +static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot, + OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) { + MOZ_ASSERT(aNode, "Null starting node?"); + + nsINode* next = aNode->GetFirstChild(); + if (next) return next; + + // Don't look at siblings or otherwise outside of aRoot + if (aNode == aRoot) return nullptr; + + next = aNode->GetNextSibling(); + if (next) return next; + + // Go up + for (;;) { + if (aOnLeaveNode) { + aOnLeaveNode(aNode, aClosure); + } + + next = aNode->GetParent(); + if (next == aRoot || !next) return nullptr; + aNode = next; + + next = aNode->GetNextSibling(); + if (next) return next; + } +} + +// aNode is not a text node. Find the first text node starting at aNode/aOffset +// in a preorder DOM traversal. +static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset, + const nsINode* aRoot) { + MOZ_ASSERT(aNode, "Null starting node?"); + MOZ_ASSERT(!IsSpellCheckingTextNode(aNode), + "FindNextTextNode should start with a non-text node"); + + nsINode* checkNode; + // Need to start at the aOffset'th child + nsIContent* child = aNode->GetChildAt_Deprecated(aOffset); + + if (child) { + checkNode = child; + } else { + // aOffset was beyond the end of the child list. 
+ // goto next node after the last descendant of aNode in + // a preorder DOM traversal. + checkNode = aNode->GetNextNonChildNode(aRoot); + } + + while (checkNode && !IsSpellCheckingTextNode(checkNode)) { + checkNode = checkNode->GetNextNode(aRoot); + } + return checkNode; +} + +// mozInlineSpellWordUtil::SetPositionAndEnd +// +// We have two ranges "hard" and "soft". The hard boundary is simply +// the scope of the root node. The soft boundary is that which is set +// by the caller of this class by calling this function. If this function is +// not called, the soft boundary is the same as the hard boundary. +// +// When we reach the soft boundary (mSoftText.GetEnd()), we keep +// going until we reach the end of a word. This allows the caller to set the +// end of the range to anything, and we will always check whole multiples of +// words. When we reach the hard boundary we stop no matter what. +// +// There is no beginning soft boundary. This is because we only go to the +// previous node once, when finding the previous word boundary in +// SetPosition(). You might think of the soft boundary as being this initial +// position. + +nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode, + int32_t aPositionOffset, + nsINode* aEndNode, + int32_t aEndOffset) { + MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, + ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode, + aPositionOffset, aEndNode, aEndOffset)); + + MOZ_ASSERT(aPositionNode, "Null begin node?"); + MOZ_ASSERT(aEndNode, "Null end node?"); + + MOZ_ASSERT(mRootNode, "Not initialized"); + + // Find a appropriate root if we are dealing with contenteditable nodes which + // are in the shadow DOM. 
+ if (mIsContentEditableOrDesignMode) { + nsINode* rootNode = aPositionNode->SubtreeRoot(); + if (rootNode != aEndNode->SubtreeRoot()) { + return NS_ERROR_FAILURE; + } + + if (mozilla::dom::ShadowRoot::FromNode(rootNode)) { + mRootNode = rootNode; + } + } + + mSoftText.Invalidate(); + + if (!IsSpellCheckingTextNode(aPositionNode)) { + // Start at the start of the first text node after aNode/aOffset. + aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode); + aPositionOffset = 0; + } + NodeOffset softBegin = NodeOffset(aPositionNode, aPositionOffset); + + if (!IsSpellCheckingTextNode(aEndNode)) { + // End at the start of the first text node after aEndNode/aEndOffset. + aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode); + aEndOffset = 0; + } + NodeOffset softEnd = NodeOffset(aEndNode, aEndOffset); + + nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd)); + if (NS_FAILED(rv)) { + return rv; + } + + int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftText.GetBegin()); + if (textOffset < 0) { + return NS_OK; + } + + mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true); + return NS_OK; +} + +nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin, + NodeOffset aSoftEnd) { + if (mSoftText.mIsValid) return NS_OK; + mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin), std::move(aSoftEnd), + mRootNode); + + mRealWords.Clear(); + Result<RealWords, nsresult> realWords = BuildRealWords(); + if (realWords.isErr()) { + return realWords.unwrapErr(); + } + + mRealWords = realWords.unwrap(); + mSoftText.mIsValid = true; + return NS_OK; +} + +nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, + nsRange** aRange) const { + NodeOffset begin = + MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN); + NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END); + return MakeRange(begin, end, aRange); +} +void 
mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord( + const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) { + NodeOffset begin = + MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN); + NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END); + *aNodeOffsetRange = NodeOffsetRange(begin, end); +} + +// mozInlineSpellWordUtil::GetRangeForWord + +nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode, + int32_t aWordOffset, + nsRange** aRange) { + // Set our soft end and start + NodeOffset pt(aWordNode, aWordOffset); + + if (!mSoftText.mIsValid || pt != mSoftText.GetBegin() || + pt != mSoftText.GetEnd()) { + mSoftText.Invalidate(); + NodeOffset softBegin = pt; + NodeOffset softEnd = pt; + nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd)); + if (NS_FAILED(rv)) { + return rv; + } + } + + int32_t offset = MapDOMPositionToSoftTextOffset(pt); + if (offset < 0) return MakeRange(pt, pt, aRange); + int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false); + if (wordIndex < 0) return MakeRange(pt, pt, aRange); + return MakeRangeForWord(mRealWords[wordIndex], aRange); +} + +// This is to fix characters that the spellchecker may not like +static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen, + nsAString& aOutput) { + aOutput.Truncate(); + for (int32_t i = 0; i < aLen; i++) { + char16_t ch = aInput.CharAt(i + aPos); + + // remove ignorable characters from the word + if (IsIgnorableCharacter(ch)) continue; + + // the spellchecker doesn't handle curly apostrophes in all languages + if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK + ch = '\''; + } + + aOutput.Append(ch); + } +} + +// mozInlineSpellWordUtil::GetNextWord +// +// FIXME-optimization: we shouldn't have to generate a range every single +// time. It would be better if the inline spellchecker didn't require a +// range unless the word was misspelled. This may or may not be possible. 
+ +bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) { + MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, + ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex)); + + if (mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length())) { + mNextWordIndex = -1; + aWord.mSkipChecking = true; + return false; + } + + const RealWord& realWord = mRealWords[mNextWordIndex]; + MakeNodeOffsetRangeForWord(realWord, &aWord.mNodeOffsetRange); + ++mNextWordIndex; + aWord.mSkipChecking = !realWord.mCheckableWord; + ::NormalizeWord(mSoftText.GetValue(), realWord.mSoftTextOffset, + realWord.mLength, aWord.mText); + + MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, + ("%s: returning: %s (skip=%d)", __FUNCTION__, + NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking)); + + return true; +} + +// mozInlineSpellWordUtil::MakeRange +// +// Convenience function for creating a range over the current document. + +nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd, + nsRange** aRange) const { + NS_ENSURE_ARG_POINTER(aBegin.mNode); + if (!mDocument) { + return NS_ERROR_NOT_INITIALIZED; + } + + ErrorResult error; + RefPtr<nsRange> range = nsRange::Create(aBegin.mNode, aBegin.mOffset, + aEnd.mNode, aEnd.mOffset, error); + if (NS_WARN_IF(error.Failed())) { + return error.StealNSResult(); + } + MOZ_ASSERT(range); + range.forget(aRange); + return NS_OK; +} + +// static +already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange( + const NodeOffsetRange& aRange) { + IgnoredErrorResult ignoredError; + RefPtr<nsRange> range = + nsRange::Create(aRange.Begin().Node(), aRange.Begin().Offset(), + aRange.End().Node(), aRange.End().Offset(), ignoredError); + NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed"); + return range.forget(); +} + +/*********** Word Splitting ************/ + +// classifies a given character in the DOM word +enum CharClass { + CHAR_CLASS_WORD, + CHAR_CLASS_SEPARATOR, + CHAR_CLASS_END_OF_INPUT +}; + +// 
Encapsulates DOM-word to real-word splitting +template <class T> +struct MOZ_STACK_CLASS WordSplitState { + const T& mDOMWordText; + int32_t mDOMWordOffset; + CharClass mCurCharClass; + + explicit WordSplitState(const T& aString) + : mDOMWordText(aString), + mDOMWordOffset(0), + mCurCharClass(CHAR_CLASS_END_OF_INPUT) {} + + CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const; + void Advance(); + void AdvanceThroughSeparators(); + void AdvanceThroughWord(); + + // Finds special words like email addresses and URLs that may start at the + // current position, and returns their length, or 0 if not found. This allows + // arbitrary word breaking rules to be used for these special entities, as + // long as they can not contain whitespace. + bool IsSpecialWord() const; + + // Similar to IsSpecialWord except that this takes a split word as + // input. This checks for things that do not require special word-breaking + // rules. + bool ShouldSkipWord(int32_t aStart, int32_t aLength) const; + + // Finds the last sequence of DOM word separators before aBeforeOffset and + // returns the offset to its first element. + Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence( + int32_t aBeforeOffset) const; + + char16_t GetUnicharAt(int32_t aIndex) const; +}; + +// WordSplitState::ClassifyCharacter +template <class T> +CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex, + bool aRecurse) const { + MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()), + "Index out of range"); + if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR; + + // this will classify the character, we want to treat "ignorable" characters + // such as soft hyphens, and also ZWJ and ZWNJ as word characters. 
+ nsUGenCategory charCategory = + mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex)); + if (charCategory == nsUGenCategory::kLetter || + IsIgnorableCharacter(mDOMWordText[aIndex]) || + mDOMWordText[aIndex] == 0x200C /* ZWNJ */ || + mDOMWordText[aIndex] == 0x200D /* ZWJ */) + return CHAR_CLASS_WORD; + + // If conditional punctuation is surrounded immediately on both sides by word + // characters it also counts as a word character. + if (IsConditionalPunctuation(mDOMWordText[aIndex])) { + if (!aRecurse) { + // not allowed to look around, this punctuation counts like a separator + return CHAR_CLASS_SEPARATOR; + } + + // check the left-hand character + if (aIndex == 0) return CHAR_CLASS_SEPARATOR; + if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) + return CHAR_CLASS_SEPARATOR; + // If the previous charatcer is a word-char, make sure that it's not a + // special dot character. + if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR; + + // now we know left char is a word-char, check the right-hand character + if (aIndex == int32_t(mDOMWordText.Length() - 1)) { + return CHAR_CLASS_SEPARATOR; + } + + if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD) + return CHAR_CLASS_SEPARATOR; + // If the next charatcer is a word-char, make sure that it's not a + // special dot character. + if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR; + + // char on either side is a word, this counts as a word + return CHAR_CLASS_WORD; + } + + // The dot character, if appearing at the end of a word, should + // be considered part of that word. Example: "etc.", or + // abbreviations + if (aIndex > 0 && mDOMWordText[aIndex] == '.' && + mDOMWordText[aIndex - 1] != '.' 
&& + ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) { + return CHAR_CLASS_WORD; + } + + // all other punctuation + if (charCategory == nsUGenCategory::kSeparator || + charCategory == nsUGenCategory::kOther || + charCategory == nsUGenCategory::kPunctuation || + charCategory == nsUGenCategory::kSymbol) { + // Don't break on hyphens, as hunspell handles them on its own. + if (aIndex > 0 && mDOMWordText[aIndex] == '-' && + mDOMWordText[aIndex - 1] != '-' && + ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) { + // A hyphen is only meaningful as a separator inside a word + // if the previous and next characters are a word character. + if (aIndex == int32_t(mDOMWordText.Length()) - 1) + return CHAR_CLASS_SEPARATOR; + if (mDOMWordText[aIndex + 1] != '.' && + ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD) + return CHAR_CLASS_WORD; + } + return CHAR_CLASS_SEPARATOR; + } + + // any other character counts as a word + return CHAR_CLASS_WORD; +} + +// WordSplitState::Advance +template <class T> +void WordSplitState<T>::Advance() { + MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index"); + MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(), + "Length beyond end"); + + mDOMWordOffset++; + if (mDOMWordOffset >= (int32_t)mDOMWordText.Length()) + mCurCharClass = CHAR_CLASS_END_OF_INPUT; + else + mCurCharClass = ClassifyCharacter(mDOMWordOffset, true); +} + +// WordSplitState::AdvanceThroughSeparators +template <class T> +void WordSplitState<T>::AdvanceThroughSeparators() { + while (mCurCharClass == CHAR_CLASS_SEPARATOR) Advance(); +} + +// WordSplitState::AdvanceThroughWord +template <class T> +void WordSplitState<T>::AdvanceThroughWord() { + while (mCurCharClass == CHAR_CLASS_WORD) Advance(); +} + +// WordSplitState::IsSpecialWord +template <class T> +bool WordSplitState<T>::IsSpecialWord() const { + // Search for email addresses. We simply define these as any sequence of + // characters with an '@' character in the middle. 
The DOM word is already + // split on whitepace, so we know that everything to the end is the address + int32_t firstColon = -1; + for (int32_t i = mDOMWordOffset; i < int32_t(mDOMWordText.Length()); i++) { + if (mDOMWordText[i] == '@') { + // only accept this if there are unambiguous word characters (don't bother + // recursing to disambiguate apostrophes) on each side. This prevents + // classifying, e.g. "@home" as an email address + + // Use this condition to only accept words with '@' in the middle of + // them. It works, but the inlinespellcker doesn't like this. The problem + // is that you type "fhsgfh@" that's a misspelled word followed by a + // symbol, but when you type another letter "fhsgfh@g" that first word + // need to be unmarked misspelled. It doesn't do this. it only checks the + // current position for potentially removing a spelling range. + if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD && + i < (int32_t)mDOMWordText.Length() - 1 && + ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) { + return true; + } + } else if (mDOMWordText[i] == ':' && firstColon < 0) { + firstColon = i; + + // If the first colon is followed by a slash, consider it a URL + // This will catch things like asdf://foo.com + if (firstColon < (int32_t)mDOMWordText.Length() - 1 && + mDOMWordText[firstColon + 1] == '/') { + return true; + } + } + } + + // Check the text before the first colon against some known protocols. It + // is impossible to check against all protocols, especially since you can + // plug in new protocols. We also don't want to waste time here checking + // against a lot of obscure protocols. 
+ if (firstColon > mDOMWordOffset) { + nsString protocol( + Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset)); + if (protocol.EqualsIgnoreCase("http") || + protocol.EqualsIgnoreCase("https") || + protocol.EqualsIgnoreCase("news") || + protocol.EqualsIgnoreCase("file") || + protocol.EqualsIgnoreCase("javascript") || + protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp")) { + return true; + } + } + + // not anything special + return false; +} + +// WordSplitState::ShouldSkipWord +template <class T> +bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const { + int32_t last = aStart + aLength; + + // check to see if the word contains a digit + for (int32_t i = aStart; i < last; i++) { + if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) == + nsUGenCategory::kNumber) { + return true; + } + } + + // not special + return false; +} + +template <class T> +Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence( + const int32_t aBeforeOffset) const { + for (int32_t i = aBeforeOffset - 1; i >= 0; --i) { + if (IsDOMWordSeparator(mDOMWordText[i]) || + (!IsAmbiguousDOMWordSeprator(mDOMWordText[i]) && + ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR)) { + // Be greedy, find as many separators as we can + for (int32_t j = i - 1; j >= 0; --j) { + if (IsDOMWordSeparator(mDOMWordText[j]) || + (!IsAmbiguousDOMWordSeprator(mDOMWordText[j]) && + ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR)) { + i = j; + } else { + break; + } + } + return Some(i); + } + } + return Nothing(); +} + +template <> +char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt( + int32_t aIndex) const { + return mDOMWordText[aIndex]; +} + +template <> +char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt( + int32_t aIndex) const { + return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex])); +} + +static inline bool IsBRElement(nsINode* aNode) { + return aNode->IsHTMLElement(nsGkAtoms::br); +} + +/** 
+ * Given a TextNode, finds the last sequence of DOM word separators before + * aBeforeOffset and returns the offset to its first element. + * + * @param aContent the TextNode to check. + * @param aBeforeOffset the offset in the TextNode before which we will search + * for the DOM separator. You can pass INT32_MAX to search the entire + * length of the string. + */ +static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence( + nsIContent* aContent, int32_t aBeforeOffset) { + const nsTextFragment* textFragment = aContent->GetText(); + MOZ_ASSERT(textFragment, "Where is our text?"); + int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength())); + + if (textFragment->Is2b()) { + nsDependentSubstring targetText(textFragment->Get2b(), end); + WordSplitState<nsDependentSubstring> state(targetText); + return state.FindOffsetOfLastDOMWordSeparatorSequence(end); + } + + nsDependentCSubstring targetText(textFragment->Get1b(), end); + WordSplitState<nsDependentCSubstring> state(targetText); + return state.FindOffsetOfLastDOMWordSeparatorSequence(end); +} + +/** + * Check if there's a DOM word separator before aBeforeOffset in this node. + * Always returns true if it's a BR element. + * aSeparatorOffset is set to the index of the first character in the last + * separator if any is found (0 for BR elements). + * + * This function does not modify aSeparatorOffset when it returns false. 
+ */ +static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset, + int32_t* aSeparatorOffset) { + if (IsBRElement(aNode)) { + *aSeparatorOffset = 0; + return true; + } + + if (!IsSpellCheckingTextNode(aNode)) return false; + + const Maybe<int32_t> separatorOffset = + FindOffsetOfLastDOMWordSeparatorSequence(aNode->AsContent(), + aBeforeOffset); + if (separatorOffset) { + *aSeparatorOffset = *separatorOffset; + return true; + } + + return false; +} + +static bool IsBreakElement(nsINode* aNode) { + if (!aNode->IsElement()) { + return false; + } + + dom::Element* element = aNode->AsElement(); + if (element->IsHTMLElement(nsGkAtoms::br)) { + return true; + } + + // If we don't have a frame, we don't consider ourselves a break + // element. In particular, words can span us. + nsIFrame* frame = element->GetPrimaryFrame(); + if (!frame) { + return false; + } + + auto* disp = frame->StyleDisplay(); + // Anything that's not an inline element is a break element. + // XXXbz should replaced inlines be break elements, though? + // Also should inline-block and such be break elements? + // + // FIXME(emilio): We should teach the spell checker to deal with generated + // content (it doesn't at all), then remove the IsListItem() check, as there + // could be no marker, etc... 
+ return !disp->IsInlineFlow() || disp->IsListItem(); +} + +struct CheckLeavingBreakElementClosure { + bool mLeftBreakElement; +}; + +static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) { + CheckLeavingBreakElementClosure* cl = + static_cast<CheckLeavingBreakElementClosure*>(aClosure); + if (!cl->mLeftBreakElement && IsBreakElement(aNode)) { + cl->mLeftBreakElement = true; + } +} + +void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) { + nsAutoString result; + ::NormalizeWord(aWord, 0, aWord.Length(), result); + aWord = result; +} + +void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText( + NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) { + MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__)); + + mBegin = std::move(aBegin); + mEnd = std::move(aEnd); + + // First we have to work backwards from mBegin to find a text node + // containing a DOM word separator, a non-inline-element + // boundary, or the hard start node. That's where we'll start building the + // soft string from. + nsINode* node = mBegin.mNode; + int32_t firstOffsetInNode = 0; + int32_t checkBeforeOffset = mBegin.mOffset; + while (node) { + if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) { + if (node == mBegin.mNode) { + // If we find a word separator on the first node, look at the preceding + // word on the text node as well. + if (firstOffsetInNode > 0) { + // Try to find the previous word boundary in the current node. If + // we can't find one, start checking previous sibling nodes (if any + // adjacent ones exist) to see if we can find any text nodes with + // DOM word separators. We bail out as soon as we see a node that is + // not a text node, or we run out of previous sibling nodes. 
In the + // event that we simply cannot find any preceding word separator, the + // offset is set to 0, and the soft text beginning node is set to the + // "most previous" text node before the original starting node, or + // kept at the original starting node if no previous text nodes exist. + int32_t newOffset = 0; + if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1, + &newOffset)) { + nsIContent* prevNode = node->GetPreviousSibling(); + while (prevNode && IsSpellCheckingTextNode(prevNode)) { + mBegin.mNode = prevNode; + const Maybe<int32_t> separatorOffset = + FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX); + if (separatorOffset) { + newOffset = *separatorOffset; + break; + } + prevNode = prevNode->GetPreviousSibling(); + } + } + firstOffsetInNode = newOffset; + } else { + firstOffsetInNode = 0; + } + + MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, + ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__, + mBegin.mOffset, firstOffsetInNode)); + mBegin.mOffset = firstOffsetInNode; + } + break; + } + checkBeforeOffset = INT32_MAX; + if (IsBreakElement(node)) { + // Since GerPrevNode follows tree *preorder*, we're about to traverse up + // out of 'node'. Since node induces breaks (e.g., it's a block), don't + // bother trying to look outside it, just stop now. + break; + } + // GetPreviousContent below expects aRootNode to be an ancestor of node. + if (!node->IsInclusiveDescendantOf(aRootNode)) { + break; + } + node = node->GetPrevNode(aRootNode); + } + + // Now build up the string moving forward through the DOM until we reach + // the soft end and *then* see a DOM word separator, a non-inline-element + // boundary, or the hard end node. 
+ mValue.Truncate(); + mDOMMapping.Clear(); + bool seenSoftEnd = false; + // Leave this outside the loop so large heap string allocations can be reused + // across iterations + while (node) { + if (node == mEnd.mNode) { + seenSoftEnd = true; + } + + bool exit = false; + if (IsSpellCheckingTextNode(node)) { + nsIContent* content = static_cast<nsIContent*>(node); + MOZ_ASSERT(content, "Where is our content?"); + const nsTextFragment* textFragment = content->GetText(); + MOZ_ASSERT(textFragment, "Where is our text?"); + uint32_t lastOffsetInNode = textFragment->GetLength(); + + if (seenSoftEnd) { + // check whether we can stop after this + for (uint32_t i = + node == mEnd.mNode ? AssertedCast<uint32_t>(mEnd.mOffset) : 0; + i < textFragment->GetLength(); ++i) { + if (IsDOMWordSeparator(textFragment->CharAt(i))) { + exit = true; + // stop at the first separator after the soft end point + lastOffsetInNode = i; + break; + } + } + } + + if (firstOffsetInNode >= 0 && + static_cast<uint32_t>(firstOffsetInNode) < lastOffsetInNode) { + const uint32_t len = lastOffsetInNode - firstOffsetInNode; + mDOMMapping.AppendElement(DOMTextMapping( + NodeOffset(node, firstOffsetInNode), mValue.Length(), len)); + + const bool ok = textFragment->AppendTo( + mValue, static_cast<uint32_t>(firstOffsetInNode), len, + mozilla::fallible); + if (!ok) { + // probably out of memory, remove from mDOMMapping + mDOMMapping.RemoveLastElement(); + exit = true; + } + } + + firstOffsetInNode = 0; + } + + if (exit) break; + + CheckLeavingBreakElementClosure closure = {false}; + node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure); + if (closure.mLeftBreakElement || (node && IsBreakElement(node))) { + // We left, or are entering, a break element (e.g., block). Maybe we can + // stop now. 
+ if (seenSoftEnd) break; + // Record the break + mValue.Append(' '); + } + } + + MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, + ("%s: got DOM string: %s", __FUNCTION__, + NS_ConvertUTF16toUTF8(mValue).get())); +} + +auto mozInlineSpellWordUtil::BuildRealWords() const + -> Result<RealWords, nsresult> { + // This is pretty simple. We just have to walk mSoftText.GetValue(), + // tokenizing it into "real words". We do an outer traversal of words + // delimited by IsDOMWordSeparator, calling SplitDOMWordAndAppendTo on each of + // those DOM words + int32_t wordStart = -1; + RealWords realWords; + for (int32_t i = 0; i < int32_t(mSoftText.GetValue().Length()); ++i) { + if (IsDOMWordSeparator(mSoftText.GetValue().CharAt(i))) { + if (wordStart >= 0) { + nsresult rv = SplitDOMWordAndAppendTo(wordStart, i, realWords); + if (NS_FAILED(rv)) { + return Err(rv); + } + wordStart = -1; + } + } else { + if (wordStart < 0) { + wordStart = i; + } + } + } + if (wordStart >= 0) { + nsresult rv = SplitDOMWordAndAppendTo( + wordStart, mSoftText.GetValue().Length(), realWords); + if (NS_FAILED(rv)) { + return Err(rv); + } + } + + return realWords; +} + +/*********** DOM/realwords<->mSoftText.GetValue() mapping functions + * ************/ + +int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset( + const NodeOffset& aNodeOffset) const { + if (!mSoftText.mIsValid) { + NS_ERROR("Soft text must be valid if we're to map into it"); + return -1; + } + + for (int32_t i = 0; i < int32_t(mSoftText.GetDOMMapping().Length()); ++i) { + const DOMTextMapping& map = mSoftText.GetDOMMapping()[i]; + if (map.mNodeOffset.mNode == aNodeOffset.mNode) { + // Allow offsets at either end of the string, in particular, allow the + // offset that's at the end of the contributed string + int32_t offsetInContributedString = + aNodeOffset.mOffset - map.mNodeOffset.mOffset; + if (offsetInContributedString >= 0 && + offsetInContributedString <= map.mLength) + return map.mSoftTextOffset + 
offsetInContributedString; + return -1; + } + } + return -1; +} + +namespace { + +template <class T> +class FirstLargerOffset { + int32_t mSoftTextOffset; + + public: + explicit FirstLargerOffset(int32_t aSoftTextOffset) + : mSoftTextOffset(aSoftTextOffset) {} + int operator()(const T& t) const { + // We want the first larger offset, so never return 0 (which would + // short-circuit evaluation before finding the last such offset). + return mSoftTextOffset < t.mSoftTextOffset ? -1 : 1; + } +}; + +template <class T> +bool FindLastNongreaterOffset(const nsTArray<T>& aContainer, + int32_t aSoftTextOffset, size_t* aIndex) { + if (aContainer.Length() == 0) { + return false; + } + + BinarySearchIf(aContainer, 0, aContainer.Length(), + FirstLargerOffset<T>(aSoftTextOffset), aIndex); + if (*aIndex > 0) { + // There was at least one mapping with offset <= aSoftTextOffset. Step back + // to find the last element with |mSoftTextOffset <= aSoftTextOffset|. + *aIndex -= 1; + } else { + // Every mapping had offset greater than aSoftTextOffset. + MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset); + } + return true; +} + +} // namespace + +NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition( + int32_t aSoftTextOffset, DOMMapHint aHint) const { + MOZ_ASSERT(mSoftText.mIsValid, + "Soft text must be valid if we're to map out of it"); + if (!mSoftText.mIsValid) return NodeOffset(nullptr, -1); + + // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset + size_t index; + bool found = FindLastNongreaterOffset(mSoftText.GetDOMMapping(), + aSoftTextOffset, &index); + if (!found) { + return NodeOffset(nullptr, -1); + } + + // 'index' is now the last mapping, if any, such that + // mSoftTextOffset <= aSoftTextOffset. 
+ // If we're doing HINT_END, then we may want to return the end of the + // the previous mapping instead of the start of this mapping + if (aHint == HINT_END && index > 0) { + const DOMTextMapping& map = mSoftText.GetDOMMapping()[index - 1]; + if (map.mSoftTextOffset + map.mLength == aSoftTextOffset) + return NodeOffset(map.mNodeOffset.mNode, + map.mNodeOffset.mOffset + map.mLength); + } + + // We allow ourselves to return the end of this mapping even if we're + // doing HINT_START. This will only happen if there is no mapping which this + // point is the start of. I'm not 100% sure this is OK... + const DOMTextMapping& map = mSoftText.GetDOMMapping()[index]; + int32_t offset = aSoftTextOffset - map.mSoftTextOffset; + if (offset >= 0 && offset <= map.mLength) + return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset); + + return NodeOffset(nullptr, -1); +} + +// static +void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint, + nsACString& aResult) { + switch (aHint) { + case HINT_BEGIN: + aResult.AssignLiteral("begin"); + break; + case HINT_END: + aResult.AssignLiteral("end"); + break; + } +} + +int32_t mozInlineSpellWordUtil::FindRealWordContaining( + int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const { + if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) { + nsAutoCString hint; + mozInlineSpellWordUtil::ToString(aHint, hint); + + MOZ_LOG( + sInlineSpellWordUtilLog, LogLevel::Debug, + ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__, + aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward))); + } + + MOZ_ASSERT(mSoftText.mIsValid, + "Soft text must be valid if we're to map out of it"); + if (!mSoftText.mIsValid) return -1; + + // Find the last word, if any, such that mRealWords[index].mSoftTextOffset + // <= aSoftTextOffset + size_t index; + bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index); + if (!found) { + return -1; + } + + // 'index' is now the last word, if 
any, such that + // mSoftTextOffset <= aSoftTextOffset. + // If we're doing HINT_END, then we may want to return the end of the + // the previous word instead of the start of this word + if (aHint == HINT_END && index > 0) { + const RealWord& word = mRealWords[index - 1]; + if (word.EndOffset() == aSoftTextOffset) { + return index - 1; + } + } + + // We allow ourselves to return the end of this word even if we're + // doing HINT_BEGIN. This will only happen if there is no word which this + // point is the start of. I'm not 100% sure this is OK... + const RealWord& word = mRealWords[index]; + int32_t offset = aSoftTextOffset - word.mSoftTextOffset; + if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index; + + if (aSearchForward) { + if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) { + // All words have mSoftTextOffset > aSoftTextOffset + return 0; + } + // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset. + // Word index+1, if it exists, will be the first with + // mSoftTextOffset > aSoftTextOffset. 
+ if (index + 1 < mRealWords.Length()) return index + 1; + } + + return -1; +} + +// mozInlineSpellWordUtil::SplitDOMWordAndAppendTo + +nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo( + int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const { + nsDependentSubstring targetText(mSoftText.GetValue(), aStart, aEnd - aStart); + WordSplitState<nsDependentSubstring> state(targetText); + state.mCurCharClass = state.ClassifyCharacter(0, true); + + state.AdvanceThroughSeparators(); + if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && state.IsSpecialWord()) { + int32_t specialWordLength = + state.mDOMWordText.Length() - state.mDOMWordOffset; + if (!aRealWords.AppendElement( + RealWord(aStart + state.mDOMWordOffset, specialWordLength, false), + fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + + return NS_OK; + } + + while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) { + state.AdvanceThroughSeparators(); + if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) break; + + // save the beginning of the word + int32_t wordOffset = state.mDOMWordOffset; + + // find the end of the word + state.AdvanceThroughWord(); + int32_t wordLen = state.mDOMWordOffset - wordOffset; + if (!aRealWords.AppendElement( + RealWord(aStart + wordOffset, wordLen, + !state.ShouldSkipWord(wordOffset, wordLen)), + fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + } + + return NS_OK; +} |