summaryrefslogtreecommitdiffstats
path: root/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--extensions/spellcheck/src/mozInlineSpellWordUtil.cpp1174
1 files changed, 1174 insertions, 0 deletions
diff --git a/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
new file mode 100644
index 0000000000..731059f04b
--- /dev/null
+++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
@@ -0,0 +1,1174 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozInlineSpellWordUtil.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "mozilla/BinarySearch.h"
+#include "mozilla/EditorBase.h"
+#include "mozilla/HTMLEditor.h"
+#include "mozilla/Logging.h"
+#include "mozilla/dom/Element.h"
+
+#include "nsDebug.h"
+#include "nsAtom.h"
+#include "nsComponentManagerUtils.h"
+#include "nsUnicodeProperties.h"
+#include "nsServiceManagerUtils.h"
+#include "nsIContent.h"
+#include "nsTextFragment.h"
+#include "nsRange.h"
+#include "nsContentUtils.h"
+#include "nsIFrame.h"
+
+using namespace mozilla;
+
+static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};
+
+// IsIgnorableCharacter
+//
+// These characters are ones that we should ignore in input.
+
+// Narrow (1-byte text) variant: only SOFT HYPHEN is ignorable.
+inline bool IsIgnorableCharacter(char ch) {
+  const char kSoftHyphen = static_cast<char>(0xAD);
+  return ch == kSoftHyphen;
+}
+
+// UTF-16 variant: the soft hyphens that are skipped when reading input.
+inline bool IsIgnorableCharacter(char16_t ch) {
+  switch (ch) {
+    case 0x00AD:  // SOFT HYPHEN
+    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
+      return true;
+    default:
+      return false;
+  }
+}
+
+// IsConditionalPunctuation
+//
+// Some characters (like apostrophes) require characters on each side to be
+// part of a word, and are otherwise punctuation.
+
+// Narrow (1-byte text) variant: ASCII apostrophe or Latin-1 middle dot.
+inline bool IsConditionalPunctuation(char ch) {
+  if (ch == '\'') {  // APOSTROPHE
+    return true;
+  }
+  return ch == static_cast<char>(0xB7);  // MIDDLE DOT
+}
+
+// UTF-16 variant: apostrophe, curly apostrophe or middle dot.
+inline bool IsConditionalPunctuation(char16_t ch) {
+  switch (ch) {
+    case '\'':    // APOSTROPHE
+    case 0x2019:  // RIGHT SINGLE QUOTATION MARK
+    case 0x00B7:  // MIDDLE DOT
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Characters of this class may be CHAR_CLASS_SEPARATOR, but whether they are
+// depends on context.
+static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
+  switch (ch) {
+    case '@':
+    case ':':
+    case '.':
+    case '/':
+    case '-':
+      return true;
+    default:
+      return IsConditionalPunctuation(ch);
+  }
+}
+
+// Narrow (1-byte text) overload: forwards to the char16_t overload.
+// Characters of this class may be CHAR_CLASS_SEPARATOR, but whether they are
+// depends on context.
+static bool IsAmbiguousDOMWordSeprator(char ch) {
+  // Widen through uint8_t first. Where plain char is signed, converting a byte
+  // >= 0x80 straight to char16_t sign-extends it (0xB7 becomes 0xFFB7), so the
+  // MIDDLE DOT would never be recognised as conditional punctuation.
+  return IsAmbiguousDOMWordSeprator(
+      static_cast<char16_t>(static_cast<uint8_t>(ch)));
+}
+
+// IsDOMWordSeparator
+//
+// Determines if the given character should be considered as a DOM Word
+// separator. Basically, this is whitespace, although it could also have
+// certain punctuation that we know ALWAYS breaks words. This is important.
+// For example, we can't have any punctuation that could appear in a URL
+// or email address in this, because those need to always fit into a single
+// DOM word.
+
+// Narrow (1-byte text) variant: simple spaces or the no-break space.
+static bool IsDOMWordSeparator(char ch) {
+  switch (ch) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\r':
+      return true;
+    default:
+      // NO-BREAK SPACE as a single byte
+      return ch == static_cast<char>(0xA0);
+  }
+}
+
+// UTF-16 variant: simple spaces plus a handful of non-ASCII spaces.
+static bool IsDOMWordSeparator(char16_t ch) {
+  // Everything below 0xA0 (the common case) can only match a simple space.
+  if (ch < 0xA0) {
+    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
+  }
+
+  // Complex spaces; anything else is not a space.
+  switch (ch) {
+    case 0x00A0:  // NO-BREAK SPACE
+    case 0x2002:  // EN SPACE
+    case 0x2003:  // EM SPACE
+    case 0x2009:  // THIN SPACE
+    case 0x3000:  // IDEOGRAPHIC SPACE
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Equal when the boundary has the same container as mNode and a valid offset
+// equal to mOffset.
+bool NodeOffset::operator==(
+    const mozilla::RangeBoundary& aRangeBoundary) const {
+  if (aRangeBoundary.Container() != mNode) {
+    return false;
+  }
+
+  const Maybe<uint32_t> boundaryOffset =
+      aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets);
+
+  MOZ_ASSERT(mOffset >= 0);
+  if (!boundaryOffset) {
+    return false;
+  }
+  return *boundaryOffset == static_cast<uint32_t>(mOffset);
+}
+
+// Equal when both end points match the range's start and end boundaries.
+bool NodeOffsetRange::operator==(const nsRange& aRange) const {
+  if (!(mBegin == aRange.StartRef())) {
+    return false;
+  }
+  return mEnd == aRange.EndRef();
+}
+
+// static
+Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
+    const EditorBase& aEditorBase) {
+  dom::Document* const doc = aEditorBase.GetDocument();
+  if (NS_WARN_IF(!doc)) {
+    return Nothing();
+  }
+
+  const bool isContentEditableOrDesignMode = aEditorBase.IsHTMLEditor();
+
+  // The editor root is the initial root node. For contenteditable, mRootNode
+  // may later be changed to a shadow root if the begin and end are inside the
+  // shadow DOM.
+  nsINode* const root = aEditorBase.GetRoot();
+  if (NS_WARN_IF(!root)) {
+    return Nothing();
+  }
+
+  return Some(
+      mozInlineSpellWordUtil{*doc, isContentEditableOrDesignMode, *root});
+}
+
+// True for text nodes, except those whose parent is an HTML <script> or
+// <style> element.
+static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
+  if (nsIContent* container = aNode->GetParent()) {
+    if (container->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style)) {
+      return false;
+    }
+  }
+  return aNode->IsText();
+}
+
+using OnLeaveNodeFunPtr = void (*)(nsINode* aNode, void* aClosure);
+
+// Returns the node following aNode in a preorder walk limited to aRoot, or
+// nullptr when there is none. aOnLeaveNode (if non-null) is invoked for every
+// node the walk climbs out of, which is why GetNextNode can't be used here.
+static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot,
+                             OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {
+  MOZ_ASSERT(aNode, "Null starting node?");
+
+  if (nsINode* firstChild = aNode->GetFirstChild()) {
+    return firstChild;
+  }
+
+  // Never look at siblings of, or anything else outside, aRoot.
+  if (aNode == aRoot) {
+    return nullptr;
+  }
+
+  if (nsINode* sibling = aNode->GetNextSibling()) {
+    return sibling;
+  }
+
+  // Climb until some ancestor has a next sibling.
+  nsINode* current = aNode;
+  while (true) {
+    if (aOnLeaveNode) {
+      aOnLeaveNode(current, aClosure);
+    }
+
+    nsINode* parent = current->GetParent();
+    if (!parent || parent == aRoot) {
+      return nullptr;
+    }
+    current = parent;
+
+    if (nsINode* sibling = current->GetNextSibling()) {
+      return sibling;
+    }
+  }
+}
+
+// aNode is not a text node. Find the first text node starting at aNode/aOffset
+// in a preorder DOM traversal.
+static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
+                                 const nsINode* aRoot) {
+  MOZ_ASSERT(aNode, "Null starting node?");
+  MOZ_ASSERT(!IsSpellCheckingTextNode(aNode),
+             "FindNextTextNode should start with a non-text node");
+
+  // Begin at the aOffset'th child. If aOffset is past the child list, begin
+  // at the node that follows aNode's last descendant in preorder instead.
+  nsINode* candidate = aNode->GetChildAt_Deprecated(aOffset);
+  if (!candidate) {
+    candidate = aNode->GetNextNonChildNode(aRoot);
+  }
+
+  for (; candidate; candidate = candidate->GetNextNode(aRoot)) {
+    if (IsSpellCheckingTextNode(candidate)) {
+      break;
+    }
+  }
+  return candidate;
+}
+
+// mozInlineSpellWordUtil::SetPositionAndEnd
+//
+// We have two ranges "hard" and "soft". The hard boundary is simply
+// the scope of the root node. The soft boundary is that which is set
+// by the caller of this class by calling this function. If this function is
+// not called, the soft boundary is the same as the hard boundary.
+//
+// When we reach the soft boundary (mSoftText.GetEnd()), we keep
+// going until we reach the end of a word. This allows the caller to set the
+// end of the range to anything, and we will always check whole multiples of
+// words. When we reach the hard boundary we stop no matter what.
+//
+// There is no beginning soft boundary. This is because we only go to the
+// previous node once, when finding the previous word boundary in
+// SetPosition(). You might think of the soft boundary as being this initial
+// position.
+
+nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
+                                                   int32_t aPositionOffset,
+                                                   nsINode* aEndNode,
+                                                   int32_t aEndOffset) {
+  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
+          ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
+           aPositionOffset, aEndNode, aEndOffset));
+
+  MOZ_ASSERT(aPositionNode, "Null begin node?");
+  MOZ_ASSERT(aEndNode, "Null end node?");
+
+  MOZ_ASSERT(mRootNode, "Not initialized");
+
+  // For contenteditable content inside a shadow tree, the shadow root becomes
+  // the root. Both end points have to live in the same subtree.
+  if (mIsContentEditableOrDesignMode) {
+    nsINode* const subtreeRoot = aPositionNode->SubtreeRoot();
+    if (subtreeRoot != aEndNode->SubtreeRoot()) {
+      return NS_ERROR_FAILURE;
+    }
+
+    if (mozilla::dom::ShadowRoot::FromNode(subtreeRoot)) {
+      mRootNode = subtreeRoot;
+    }
+  }
+
+  mSoftText.Invalidate();
+
+  // A point that is not in a spell-checkable text node is moved to the start
+  // of the first text node after it.
+  const auto snapToTextNode = [this](nsINode*& aNode, int32_t& aOffset) {
+    if (IsSpellCheckingTextNode(aNode)) {
+      return;
+    }
+    aNode = FindNextTextNode(aNode, aOffset, mRootNode);
+    aOffset = 0;
+  };
+
+  snapToTextNode(aPositionNode, aPositionOffset);
+  NodeOffset softBegin(aPositionNode, aPositionOffset);
+
+  snapToTextNode(aEndNode, aEndOffset);
+  NodeOffset softEnd(aEndNode, aEndOffset);
+
+  const nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  const int32_t textOffset =
+      MapDOMPositionToSoftTextOffset(mSoftText.GetBegin());
+  if (textOffset >= 0) {
+    mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
+  }
+  return NS_OK;
+}
+
+// Rebuilds the soft text and the real-word list unless the soft text is
+// already valid. The soft text is only marked valid once the words were built.
+nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin,
+                                             NodeOffset aSoftEnd) {
+  if (mSoftText.mIsValid) {
+    return NS_OK;
+  }
+
+  mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin),
+                                    std::move(aSoftEnd), mRootNode);
+
+  mRealWords.Clear();
+  Result<RealWords, nsresult> built = BuildRealWords();
+  if (built.isErr()) {
+    return built.unwrapErr();
+  }
+
+  mRealWords = built.unwrap();
+  mSoftText.mIsValid = true;
+  return NS_OK;
+}
+
+// Creates an nsRange spanning aWord by mapping its soft-text extent back to
+// DOM positions.
+nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
+                                                  nsRange** aRange) const {
+  const NodeOffset wordBegin =
+      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
+  const NodeOffset wordEnd =
+      MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
+  return MakeRange(wordBegin, wordEnd, aRange);
+}
+// Same mapping as MakeRangeForWord, but writes a NodeOffsetRange instead of
+// creating an nsRange.
+void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
+    const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
+  const NodeOffset wordBegin =
+      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
+  const NodeOffset wordEnd =
+      MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
+  *aNodeOffsetRange = NodeOffsetRange(wordBegin, wordEnd);
+}
+
+// mozInlineSpellWordUtil::GetRangeForWord
+
+nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
+                                                 int32_t aWordOffset,
+                                                 nsRange** aRange) {
+  // The requested point serves as both the soft begin and the soft end.
+  NodeOffset pt(aWordNode, aWordOffset);
+
+  const bool softTextReusable = mSoftText.mIsValid &&
+                                !(pt != mSoftText.GetBegin()) &&
+                                !(pt != mSoftText.GetEnd());
+  if (!softTextReusable) {
+    mSoftText.Invalidate();
+    const nsresult rv = EnsureWords(pt, pt);
+    if (NS_FAILED(rv)) {
+      return rv;
+    }
+  }
+
+  // When the point can't be mapped, or no word contains it, hand back a
+  // collapsed range at the point.
+  const int32_t offset = MapDOMPositionToSoftTextOffset(pt);
+  const int32_t wordIndex =
+      offset < 0 ? -1 : FindRealWordContaining(offset, HINT_BEGIN, false);
+  if (wordIndex < 0) {
+    return MakeRange(pt, pt, aRange);
+  }
+  return MakeRangeForWord(mRealWords[wordIndex], aRange);
+}
+
+// This is to fix characters that the spellchecker may not like
+static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
+                          nsAString& aOutput) {
+  aOutput.Truncate();
+  for (int32_t consumed = 0; consumed < aLen; ++consumed) {
+    const char16_t raw = aInput.CharAt(aPos + consumed);
+
+    // Ignorable characters are dropped from the word.
+    if (IsIgnorableCharacter(raw)) {
+      continue;
+    }
+
+    // The spellchecker doesn't handle curly apostrophes in all languages, so
+    // RIGHT SINGLE QUOTATION MARK is replaced by a plain apostrophe.
+    const bool isCurlyApostrophe = raw == 0x2019;
+    aOutput.Append(isCurlyApostrophe ? char16_t('\'') : raw);
+  }
+}
+
+// mozInlineSpellWordUtil::GetNextWord
+//
+// FIXME-optimization: we shouldn't have to generate a range every single
+// time. It would be better if the inline spellchecker didn't require a
+// range unless the word was misspelled. This may or may not be possible.
+
+bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) {
+  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
+          ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));
+
+  const int32_t wordCount = int32_t(mRealWords.Length());
+  const bool exhausted = mNextWordIndex < 0 || mNextWordIndex >= wordCount;
+  if (exhausted) {
+    mNextWordIndex = -1;
+    aWord.mSkipChecking = true;
+    return false;
+  }
+
+  // Fill in the range, the skip flag and the normalized text for the current
+  // word, and step on to the following one.
+  const RealWord& current = mRealWords[mNextWordIndex];
+  MakeNodeOffsetRangeForWord(current, &aWord.mNodeOffsetRange);
+  ++mNextWordIndex;
+  aWord.mSkipChecking = !current.mCheckableWord;
+  ::NormalizeWord(mSoftText.GetValue(), current.mSoftTextOffset,
+                  current.mLength, aWord.mText);
+
+  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
+          ("%s: returning: %s (skip=%d)", __FUNCTION__,
+           NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking));
+
+  return true;
+}
+
+// mozInlineSpellWordUtil::MakeRange
+//
+// Convenience function for creating a range over the current document.
+
+nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
+                                           nsRange** aRange) const {
+  NS_ENSURE_ARG_POINTER(aBegin.mNode);
+  if (!mDocument) {
+    return NS_ERROR_NOT_INITIALIZED;
+  }
+
+  ErrorResult creationResult;
+  RefPtr<nsRange> created = nsRange::Create(
+      aBegin.mNode, aBegin.mOffset, aEnd.mNode, aEnd.mOffset, creationResult);
+  if (NS_WARN_IF(creationResult.Failed())) {
+    return creationResult.StealNSResult();
+  }
+
+  MOZ_ASSERT(created);
+  // Ownership of the new range goes to the caller.
+  created.forget(aRange);
+  return NS_OK;
+}
+
+// static
+already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
+    const NodeOffsetRange& aRange) {
+  auto&& begin = aRange.Begin();
+  auto&& end = aRange.End();
+
+  // A failure is only warned about; whatever nsRange::Create produced is
+  // returned either way.
+  IgnoredErrorResult ignoredError;
+  RefPtr<nsRange> created = nsRange::Create(begin.Node(), begin.Offset(),
+                                            end.Node(), end.Offset(),
+                                            ignoredError);
+  NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed");
+  return created.forget();
+}
+
+/*********** Word Splitting ************/
+
+// classifies a given character in the DOM word
+enum CharClass {
+ CHAR_CLASS_WORD,  // the character belongs to a word
+ CHAR_CLASS_SEPARATOR,  // the character separates words
+ CHAR_CLASS_END_OF_INPUT  // no character: the splitter is at/after the end
+};
+
+// Encapsulates DOM-word to real-word splitting
+template <class T>
+struct MOZ_STACK_CLASS WordSplitState {
+ const T& mDOMWordText;  // text being split; held by reference, not copied
+ int32_t mDOMWordOffset;  // index of the current character in mDOMWordText
+ CharClass mCurCharClass;  // starts as END_OF_INPUT; updated by Advance()
+
+ explicit WordSplitState(const T& aString)
+ : mDOMWordText(aString),
+ mDOMWordOffset(0),
+ mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
+
+ CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
+ void Advance();
+ void AdvanceThroughSeparators();
+ void AdvanceThroughWord();
+
+ // Reports whether a special word such as an email address or URL starts at
+ // the current position (true if one is found, false otherwise). This allows
+ // arbitrary word breaking rules to be used for these special entities, as
+ // long as they can not contain whitespace.
+ bool IsSpecialWord() const;
+
+ // Similar to IsSpecialWord except that this takes a split word as
+ // input. This checks for things that do not require special word-breaking
+ // rules.
+ bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
+
+ // Finds the last sequence of DOM word separators before aBeforeOffset and
+ // returns the offset to its first element.
+ Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
+ int32_t aBeforeOffset) const;
+
+ char16_t GetUnicharAt(int32_t aIndex) const;  // code unit at aIndex, widened
+};
+
+// WordSplitState::ClassifyCharacter
+//
+// Classifies the character at aIndex as CHAR_CLASS_WORD or
+// CHAR_CLASS_SEPARATOR; CHAR_CLASS_END_OF_INPUT is never returned from here.
+// aIndex may equal the text length, which is reported as a separator.
+//
+// aRecurse controls whether conditional punctuation may be resolved by
+// looking at its neighbours. The nested neighbour lookups always pass false,
+// so they never look around in turn.
+template <class T>
+CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
+                                               bool aRecurse) const {
+  MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
+             "Index out of range");
+  if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
+
+  // this will classify the character, we want to treat "ignorable" characters
+  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
+  nsUGenCategory charCategory =
+      mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
+  if (charCategory == nsUGenCategory::kLetter ||
+      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
+      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
+      mDOMWordText[aIndex] == 0x200D /* ZWJ */)
+    return CHAR_CLASS_WORD;
+
+  // If conditional punctuation is surrounded immediately on both sides by word
+  // characters it also counts as a word character.
+  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
+    if (!aRecurse) {
+      // not allowed to look around, this punctuation counts like a separator
+      return CHAR_CLASS_SEPARATOR;
+    }
+
+    // check the left-hand character
+    if (aIndex == 0) return CHAR_CLASS_SEPARATOR;
+    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
+      return CHAR_CLASS_SEPARATOR;
+    // If the previous character is a word-char, make sure that it's not a
+    // special dot character.
+    if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR;
+
+    // now we know left char is a word-char, check the right-hand character
+    if (aIndex == int32_t(mDOMWordText.Length() - 1)) {
+      return CHAR_CLASS_SEPARATOR;
+    }
+
+    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
+      return CHAR_CLASS_SEPARATOR;
+    // If the next character is a word-char, make sure that it's not a
+    // special dot character.
+    if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR;
+
+    // char on either side is a word, this counts as a word
+    return CHAR_CLASS_WORD;
+  }
+
+  // The dot character, if appearing at the end of a word, should
+  // be considered part of that word. Example: "etc.", or
+  // abbreviations. "At the end of a word" means the preceding character is a
+  // word character (and not itself a dot), so the neighbour has to classify
+  // as CHAR_CLASS_WORD for the dot to join the word.
+  if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
+      mDOMWordText[aIndex - 1] != '.' &&
+      ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
+    return CHAR_CLASS_WORD;
+  }
+
+  // all other punctuation
+  if (charCategory == nsUGenCategory::kSeparator ||
+      charCategory == nsUGenCategory::kOther ||
+      charCategory == nsUGenCategory::kPunctuation ||
+      charCategory == nsUGenCategory::kSymbol) {
+    // Don't break on hyphens, as hunspell handles them on its own.
+    if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
+        mDOMWordText[aIndex - 1] != '-' &&
+        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
+      // A hyphen is only kept inside a word if the previous and next
+      // characters are both word characters.
+      if (aIndex == int32_t(mDOMWordText.Length()) - 1)
+        return CHAR_CLASS_SEPARATOR;
+      if (mDOMWordText[aIndex + 1] != '.' &&
+          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
+        return CHAR_CLASS_WORD;
+    }
+    return CHAR_CLASS_SEPARATOR;
+  }
+
+  // any other character counts as a word
+  return CHAR_CLASS_WORD;
+}
+
+// WordSplitState::Advance
+// Steps to the next character and refreshes mCurCharClass for it, or marks
+// end of input once the offset reaches the end of the text.
+template <class T>
+void WordSplitState<T>::Advance() {
+  MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index");
+  MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
+             "Length beyond end");
+
+  ++mDOMWordOffset;
+  const bool reachedEnd = mDOMWordOffset >= int32_t(mDOMWordText.Length());
+  mCurCharClass = reachedEnd ? CHAR_CLASS_END_OF_INPUT
+                             : ClassifyCharacter(mDOMWordOffset, true);
+}
+
+// WordSplitState::AdvanceThroughSeparators
+// Skips forward for as long as the current character is a separator.
+template <class T>
+void WordSplitState<T>::AdvanceThroughSeparators() {
+  for (; mCurCharClass == CHAR_CLASS_SEPARATOR; Advance()) {
+  }
+}
+
+// WordSplitState::AdvanceThroughWord
+// Skips forward for as long as the current character is a word character.
+template <class T>
+void WordSplitState<T>::AdvanceThroughWord() {
+  for (; mCurCharClass == CHAR_CLASS_WORD; Advance()) {
+  }
+}
+
+// WordSplitState::IsSpecialWord
+template <class T>
+bool WordSplitState<T>::IsSpecialWord() const {
+  const int32_t length = int32_t(mDOMWordText.Length());
+  const int32_t lastIndex = length - 1;
+
+  // Email addresses are defined simply as any sequence of characters with an
+  // '@' in the middle. The DOM word was already split on whitespace, so
+  // everything up to the end belongs to the address.
+  int32_t firstColon = -1;
+  for (int32_t i = mDOMWordOffset; i < length; i++) {
+    const auto ch = mDOMWordText[i];
+
+    if (ch == '@') {
+      // Only accept the '@' when it has unambiguous word characters on both
+      // sides (no recursing to disambiguate apostrophes), so that e.g.
+      // "@home" isn't taken for an email address.
+      //
+      // The inline spellchecker doesn't like this condition: typing "fhsgfh@"
+      // gives a misspelled word followed by a symbol, and after typing one
+      // more letter ("fhsgfh@g") that first word needs to be unmarked, which
+      // it isn't, since only the current position is checked for potentially
+      // removing a spelling range.
+      const bool wordBefore =
+          i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD;
+      if (wordBefore && i < lastIndex &&
+          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
+        return true;
+      }
+      continue;
+    }
+
+    if (ch != ':' || firstColon >= 0) {
+      continue;
+    }
+    firstColon = i;
+
+    // A first colon directly followed by a slash is taken to be a URL, which
+    // catches things like asdf://foo.com
+    if (firstColon < lastIndex && mDOMWordText[firstColon + 1] == '/') {
+      return true;
+    }
+  }
+
+  // Compare what precedes the first colon against a few known protocols.
+  // Checking against all protocols is impossible (new ones can be plugged
+  // in), and checking many obscure ones would waste time.
+  if (firstColon > mDOMWordOffset) {
+    nsString protocol(
+        Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
+    if (protocol.EqualsIgnoreCase("http") ||
+        protocol.EqualsIgnoreCase("https") ||
+        protocol.EqualsIgnoreCase("news") ||
+        protocol.EqualsIgnoreCase("file") ||
+        protocol.EqualsIgnoreCase("javascript") ||
+        protocol.EqualsIgnoreCase("data") ||
+        protocol.EqualsIgnoreCase("ftp")) {
+      return true;
+    }
+  }
+
+  // nothing special about this word
+  return false;
+}
+
+// WordSplitState::ShouldSkipWord
+// Returns true when any character of [aStart, aStart + aLength) is in the
+// Unicode Number general category.
+template <class T>
+bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
+  const int32_t end = aStart + aLength;
+
+  int32_t index = aStart;
+  while (index < end) {
+    const nsUGenCategory category =
+        mozilla::unicode::GetGenCategory(GetUnicharAt(index));
+    if (category == nsUGenCategory::kNumber) {
+      return true;
+    }
+    ++index;
+  }
+
+  // no digit found
+  return false;
+}
+
+template <class T>
+Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence(
+    const int32_t aBeforeOffset) const {
+  // A position counts when it holds a plain DOM word separator, or a
+  // non-ambiguous character that classifies as a separator.
+  const auto isSeparatorAt = [this](int32_t aIndex) {
+    return IsDOMWordSeparator(mDOMWordText[aIndex]) ||
+           (!IsAmbiguousDOMWordSeprator(mDOMWordText[aIndex]) &&
+            ClassifyCharacter(aIndex, true) == CHAR_CLASS_SEPARATOR);
+  };
+
+  // Scan backwards for the nearest separator before aBeforeOffset.
+  int32_t index = aBeforeOffset - 1;
+  while (index >= 0 && !isSeparatorAt(index)) {
+    --index;
+  }
+  if (index < 0) {
+    return Nothing();
+  }
+
+  // Be greedy: extend to the first element of the separator run.
+  while (index > 0 && isSeparatorAt(index - 1)) {
+    --index;
+  }
+  return Some(index);
+}
+
+// UTF-16 text: the code unit is returned unchanged.
+template <>
+char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
+    int32_t aIndex) const {
+  const char16_t codeUnit = mDOMWordText[aIndex];
+  return codeUnit;
+}
+
+// 1-byte text: the byte is widened via uint8_t so that it is zero-extended.
+template <>
+char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
+    int32_t aIndex) const {
+  const uint8_t byte = static_cast<uint8_t>(mDOMWordText[aIndex]);
+  return static_cast<char16_t>(byte);
+}
+
+// True when aNode is an HTML <br> element.
+static inline bool IsBRElement(nsINode* aNode) {
+  const bool isBR = aNode->IsHTMLElement(nsGkAtoms::br);
+  return isBR;
+}
+
+/**
+ * Given a TextNode, finds the last sequence of DOM word separators before
+ * aBeforeOffset and returns the offset to its first element.
+ *
+ * @param aContent the TextNode to check.
+ * @param aBeforeOffset the offset in the TextNode before which we will search
+ * for the DOM separator. You can pass INT32_MAX to search the entire
+ * length of the string.
+ */
+static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
+    nsIContent* aContent, int32_t aBeforeOffset) {
+  const nsTextFragment* fragment = aContent->GetText();
+  MOZ_ASSERT(fragment, "Where is our text?");
+  // Never search past the end of the text.
+  const int32_t limit =
+      std::min(aBeforeOffset, int32_t(fragment->GetLength()));
+
+  // The fragment stores either 1-byte or 2-byte text; use the matching
+  // splitter for each.
+  if (!fragment->Is2b()) {
+    nsDependentCSubstring narrowText(fragment->Get1b(), limit);
+    WordSplitState<nsDependentCSubstring> narrowState(narrowText);
+    return narrowState.FindOffsetOfLastDOMWordSeparatorSequence(limit);
+  }
+
+  nsDependentSubstring wideText(fragment->Get2b(), limit);
+  WordSplitState<nsDependentSubstring> wideState(wideText);
+  return wideState.FindOffsetOfLastDOMWordSeparatorSequence(limit);
+}
+
+/**
+ * Check if there's a DOM word separator before aBeforeOffset in this node.
+ * Always returns true if it's a BR element.
+ * aSeparatorOffset is set to the index of the first character in the last
+ * separator if any is found (0 for BR elements).
+ *
+ * This function does not modify aSeparatorOffset when it returns false.
+ */
+static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
+                                     int32_t* aSeparatorOffset) {
+  // A BR always counts as a separator, at offset 0.
+  if (IsBRElement(aNode)) {
+    *aSeparatorOffset = 0;
+    return true;
+  }
+
+  if (!IsSpellCheckingTextNode(aNode)) {
+    return false;
+  }
+
+  const Maybe<int32_t> found = FindOffsetOfLastDOMWordSeparatorSequence(
+      aNode->AsContent(), aBeforeOffset);
+  if (!found) {
+    // aSeparatorOffset is left untouched in this case.
+    return false;
+  }
+
+  *aSeparatorOffset = *found;
+  return true;
+}
+
+// Decides whether aNode is an element that words can't span.
+static bool IsBreakElement(nsINode* aNode) {
+  if (!aNode->IsElement()) {
+    return false;
+  }
+
+  dom::Element* const element = aNode->AsElement();
+  if (element->IsHTMLElement(nsGkAtoms::br)) {
+    return true;
+  }
+
+  // An element without a frame is not considered a break element. In
+  // particular, words can span it.
+  nsIFrame* const frame = element->GetPrimaryFrame();
+  if (!frame) {
+    return false;
+  }
+
+  const auto* display = frame->StyleDisplay();
+
+  // FIXME(emilio): We should teach the spell checker to deal with generated
+  // content (it doesn't at all), then remove the IsListItem() check, as there
+  // could be no marker, etc...
+  if (display->IsListItem()) {
+    return true;
+  }
+
+  // Anything that's not an inline element is a break element.
+  // XXXbz should replaced inlines be break elements, though?
+  // Also should inline-block and such be break elements?
+  return !display->IsInlineFlow();
+}
+
+struct CheckLeavingBreakElementClosure {  // state for CheckLeavingBreakElement
+ bool mLeftBreakElement;  // set once a node being left is a break element
+};
+
+// Leave-node callback: latches mLeftBreakElement in the closure the first
+// time the node being left is a break element.
+static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
+  auto* const state = static_cast<CheckLeavingBreakElementClosure*>(aClosure);
+  if (state->mLeftBreakElement) {
+    return;
+  }
+  if (IsBreakElement(aNode)) {
+    state->mLeftBreakElement = true;
+  }
+}
+
+// Normalizes aWord in place using the file-local NormalizeWord helper.
+void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
+  nsAutoString normalized;
+  ::NormalizeWord(aWord, 0, aWord.Length(), normalized);
+  aWord.Assign(normalized);
+}
+
+void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
+ NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) {
+ MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));
+
+ mBegin = std::move(aBegin);
+ mEnd = std::move(aEnd);
+
+ // Phase 1: work backwards from mBegin to find a text node containing a DOM
+ // word separator, a non-inline-element boundary, or the hard start node.
+ // `node` and `firstOffsetInNode` end up naming the place where we'll start
+ // building the soft string from.
+ nsINode* node = mBegin.mNode;
+ int32_t firstOffsetInNode = 0;
+ int32_t checkBeforeOffset = mBegin.mOffset;  // limits the first node only
+ while (node) {
+ if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
+ if (node == mBegin.mNode) {
+ // If we find a word separator on the first node, look at the preceding
+ // word on the text node as well.
+ if (firstOffsetInNode > 0) {
+ // Try to find the previous word boundary in the current node. If
+ // we can't find one, start checking previous sibling nodes (if any
+ // adjacent ones exist) to see if we can find any text nodes with
+ // DOM word separators. We bail out as soon as we see a node that is
+ // not a text node, or we run out of previous sibling nodes. In the
+ // event that we simply cannot find any preceding word separator, the
+ // offset is set to 0, and the soft text beginning node is set to the
+ // "most previous" text node before the original starting node, or
+ // kept at the original starting node if no previous text nodes exist.
+ int32_t newOffset = 0;
+ if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
+ &newOffset)) {
+ nsIContent* prevNode = node->GetPreviousSibling();
+ while (prevNode && IsSpellCheckingTextNode(prevNode)) {
+ mBegin.mNode = prevNode;
+ const Maybe<int32_t> separatorOffset =
+ FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX);
+ if (separatorOffset) {
+ newOffset = *separatorOffset;
+ break;
+ }
+ prevNode = prevNode->GetPreviousSibling();
+ }
+ }
+ firstOffsetInNode = newOffset;
+ } else {
+ firstOffsetInNode = 0;
+ }
+
+ MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
+ ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__,
+ mBegin.mOffset, firstOffsetInNode));
+ mBegin.mOffset = firstOffsetInNode;
+ }
+ break;
+ }
+ checkBeforeOffset = INT32_MAX;  // earlier nodes are searched in full
+ if (IsBreakElement(node)) {
+ // Since GetPreviousContent follows tree *preorder*, we're about to
+ // traverse up out of 'node'. Since node induces breaks (e.g., it's a
+ // block), don't bother trying to look outside it, just stop now.
+ break;
+ }
+ // GetPreviousContent below expects aRootNode to be an ancestor of node.
+ if (!node->IsInclusiveDescendantOf(aRootNode)) {
+ break;
+ }
+ node = node->GetPreviousContent(aRootNode);
+ }
+
+ // Phase 2: now build up the string moving forward through the DOM until we
+ // reach the soft end and *then* see a DOM word separator, a
+ // non-inline-element boundary, or the hard end node.
+ mValue.Truncate();
+ mDOMMapping.Clear();
+ bool seenSoftEnd = false;
+ // seenSoftEnd flips once the walk reaches mEnd.mNode; from then on the first
+ // DOM word separator or break-element boundary ends the soft text.
+ while (node) {
+ if (node == mEnd.mNode) {
+ seenSoftEnd = true;
+ }
+
+ bool exit = false;
+ if (IsSpellCheckingTextNode(node)) {
+ nsIContent* content = static_cast<nsIContent*>(node);
+ MOZ_ASSERT(content, "Where is our content?");
+ const nsTextFragment* textFragment = content->GetText();
+ MOZ_ASSERT(textFragment, "Where is our text?");
+ uint32_t lastOffsetInNode = textFragment->GetLength();
+
+ if (seenSoftEnd) {
+ // check whether we can stop after this
+ for (uint32_t i =
+ node == mEnd.mNode ? AssertedCast<uint32_t>(mEnd.mOffset) : 0;
+ i < textFragment->GetLength(); ++i) {
+ if (IsDOMWordSeparator(textFragment->CharAt(i))) {
+ exit = true;
+ // stop at the first separator after the soft end point
+ lastOffsetInNode = i;
+ break;
+ }
+ }
+ }
+
+ if (firstOffsetInNode >= 0 &&
+ static_cast<uint32_t>(firstOffsetInNode) < lastOffsetInNode) {
+ const uint32_t len = lastOffsetInNode - firstOffsetInNode;
+ mDOMMapping.AppendElement(DOMTextMapping(
+ NodeOffset(node, firstOffsetInNode), mValue.Length(), len));
+
+ const bool ok = textFragment->AppendTo(
+ mValue, static_cast<uint32_t>(firstOffsetInNode), len,
+ mozilla::fallible);
+ if (!ok) {
+ // probably out of memory, remove from mDOMMapping
+ mDOMMapping.RemoveLastElement();
+ exit = true;
+ }
+ }
+
+ firstOffsetInNode = 0;  // later text nodes contribute from their start
+ }
+
+ if (exit) break;
+
+ CheckLeavingBreakElementClosure closure = {false};
+ node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure);
+ if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
+ // We left, or are entering, a break element (e.g., block). Maybe we can
+ // stop now.
+ if (seenSoftEnd) break;
+ // Record the break
+ mValue.Append(' ');  // no mDOMMapping entry is added for this space
+ }
+ }
+
+ MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
+ ("%s: got DOM string: %s", __FUNCTION__,
+ NS_ConvertUTF16toUTF8(mValue).get()));
+}
+
+auto mozInlineSpellWordUtil::BuildRealWords() const
+    -> Result<RealWords, nsresult> {
+  // Walk the soft text, cutting it into DOM words at IsDOMWordSeparator
+  // characters, and let SplitDOMWordAndAppendTo tokenize each DOM word into
+  // "real words".
+  RealWords realWords;
+  const auto& softText = mSoftText.GetValue();
+  const int32_t length = int32_t(softText.Length());
+
+  int32_t wordStart = -1;
+  for (int32_t i = 0; i < length; ++i) {
+    if (!IsDOMWordSeparator(softText.CharAt(i))) {
+      // Inside a DOM word; remember where it began.
+      if (wordStart < 0) {
+        wordStart = i;
+      }
+      continue;
+    }
+
+    if (wordStart < 0) {
+      continue;
+    }
+
+    // A separator ends the DOM word that began at wordStart.
+    const nsresult rv = SplitDOMWordAndAppendTo(wordStart, i, realWords);
+    if (NS_FAILED(rv)) {
+      return Err(rv);
+    }
+    wordStart = -1;
+  }
+
+  // The text may end in the middle of a DOM word.
+  if (wordStart >= 0) {
+    const nsresult rv =
+        SplitDOMWordAndAppendTo(wordStart, softText.Length(), realWords);
+    if (NS_FAILED(rv)) {
+      return Err(rv);
+    }
+  }
+
+  return realWords;
+}
+
+/*********** DOM/realwords<->mSoftText.GetValue() mapping functions
+ * ************/
+
+// Translates a DOM position (node + offset) into an offset in the soft text.
+// Returns -1 when the soft text is invalid, when no mapping was recorded for
+// the node, or when the offset lies outside the part of the node that was
+// contributed to the soft text.
+int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
+    const NodeOffset& aNodeOffset) const {
+  if (!mSoftText.mIsValid) {
+    NS_ERROR("Soft text must be valid if we're to map into it");
+    return -1;
+  }
+
+  for (const DOMTextMapping& mapping : mSoftText.GetDOMMapping()) {
+    if (mapping.mNodeOffset.mNode != aNodeOffset.mNode) {
+      continue;
+    }
+
+    // Only the first mapping for the node is consulted. Both ends of the
+    // contributed string are accepted, in particular the offset just past its
+    // last character.
+    const int32_t relativeOffset =
+        aNodeOffset.mOffset - mapping.mNodeOffset.mOffset;
+    const bool insideContribution =
+        relativeOffset >= 0 && relativeOffset <= mapping.mLength;
+    return insideContribution ? mapping.mSoftTextOffset + relativeOffset : -1;
+  }
+
+  return -1;
+}
+
+namespace {
+
+// Comparator for BinarySearchIf that steers the search to the first element
+// whose mSoftTextOffset is strictly larger than the target offset.
+template <class T>
+class FirstLargerOffset {
+ public:
+  explicit FirstLargerOffset(int32_t aSoftTextOffset)
+      : mSoftTextOffset(aSoftTextOffset) {}
+
+  // Deliberately never reports a match (0): a match would end the search at
+  // an arbitrary element with an equal offset instead of continuing to the
+  // boundary between "not greater" and "greater" elements.
+  int operator()(const T& t) const {
+    if (mSoftTextOffset < t.mSoftTextOffset) {
+      return -1;
+    }
+    return 1;
+  }
+
+ private:
+  int32_t mSoftTextOffset;
+};
+
+// Locates, in a container ordered by mSoftTextOffset, the last element whose
+// mSoftTextOffset is <= aSoftTextOffset and stores its index in *aIndex.
+//
+// Returns false (leaving *aIndex untouched) only when the container is empty.
+// When every element has a larger offset, the function still returns true and
+// *aIndex is 0, so callers must range-check the element they get back.
+template <class T>
+bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,
+                              int32_t aSoftTextOffset, size_t* aIndex) {
+  const size_t count = aContainer.Length();
+  if (count == 0) {
+    return false;
+  }
+
+  // The comparator never reports a match, so the search leaves *aIndex at the
+  // first element with a strictly larger offset (or at |count|).
+  BinarySearchIf(aContainer, 0, count, FirstLargerOffset<T>(aSoftTextOffset),
+                 aIndex);
+
+  if (*aIndex == 0) {
+    // Every element has an offset greater than aSoftTextOffset.
+    MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
+    return true;
+  }
+
+  // At least one element has an offset <= aSoftTextOffset; the last such
+  // element sits immediately before the one the search stopped at.
+  --*aIndex;
+  return true;
+}
+
+} // namespace
+
+// Translates an offset in the soft text back into a DOM position. Returns
+// NodeOffset(nullptr, -1) when the soft text is invalid, when there are no
+// DOM mappings, or when the offset falls outside the chosen mapping.
+//
+// aHint disambiguates offsets that sit exactly on the seam between two
+// mappings: with HINT_END the end of the earlier mapping is preferred over
+// the start of the later one.
+NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
+    int32_t aSoftTextOffset, DOMMapHint aHint) const {
+  MOZ_ASSERT(mSoftText.mIsValid,
+             "Soft text must be valid if we're to map out of it");
+  if (!mSoftText.mIsValid) {
+    return NodeOffset(nullptr, -1);
+  }
+
+  const auto& mappings = mSoftText.GetDOMMapping();
+
+  // Last mapping (if any) with mSoftTextOffset <= aSoftTextOffset.
+  size_t lastIndex;
+  if (!FindLastNongreaterOffset(mappings, aSoftTextOffset, &lastIndex)) {
+    return NodeOffset(nullptr, -1);
+  }
+
+  // For HINT_END, prefer the end of the preceding mapping when the offset
+  // coincides with it.
+  if (lastIndex > 0 && aHint == HINT_END) {
+    const DOMTextMapping& previous = mappings[lastIndex - 1];
+    if (previous.mSoftTextOffset + previous.mLength == aSoftTextOffset) {
+      return NodeOffset(previous.mNodeOffset.mNode,
+                        previous.mNodeOffset.mOffset + previous.mLength);
+    }
+  }
+
+  // The end of this mapping may be returned even for HINT_BEGIN. That only
+  // happens when no mapping starts at this point; the original author was
+  // not fully sure this is OK.
+  const DOMTextMapping& current = mappings[lastIndex];
+  const int32_t delta = aSoftTextOffset - current.mSoftTextOffset;
+  if (delta < 0 || delta > current.mLength) {
+    return NodeOffset(nullptr, -1);
+  }
+  return NodeOffset(current.mNodeOffset.mNode,
+                    current.mNodeOffset.mOffset + delta);
+}
+
+// static
+// Writes a short lowercase name for aHint into aResult ("begin" or "end").
+// aResult is left untouched for any other value.
+void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
+                                      nsACString& aResult) {
+  if (aHint == HINT_BEGIN) {
+    aResult.AssignLiteral("begin");
+    return;
+  }
+  if (aHint == HINT_END) {
+    aResult.AssignLiteral("end");
+  }
+}
+
+// Returns the index in mRealWords of the word containing aSoftTextOffset, or
+// -1 if there is none.
+//
+// aHint disambiguates an offset that is both the end of one word and the
+// start of the next: HINT_END selects the earlier word. When no word contains
+// the offset and aSearchForward is set, the index of the first word starting
+// after the offset is returned instead (if such a word exists).
+int32_t mozInlineSpellWordUtil::FindRealWordContaining(
+    int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {
+  if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {
+    nsAutoCString hintName;
+    mozInlineSpellWordUtil::ToString(aHint, hintName);
+
+    MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
+            ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,
+             aSoftTextOffset, hintName.get(),
+             static_cast<int32_t>(aSearchForward)));
+  }
+
+  MOZ_ASSERT(mSoftText.mIsValid,
+             "Soft text must be valid if we're to map out of it");
+  if (!mSoftText.mIsValid) {
+    return -1;
+  }
+
+  // Last word (if any) with mSoftTextOffset <= aSoftTextOffset.
+  size_t lastIndex;
+  if (!FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &lastIndex)) {
+    return -1;
+  }
+
+  // For HINT_END, prefer the preceding word when the offset is exactly its
+  // end.
+  if (lastIndex > 0 && aHint == HINT_END) {
+    if (mRealWords[lastIndex - 1].EndOffset() == aSoftTextOffset) {
+      return static_cast<int32_t>(lastIndex - 1);
+    }
+  }
+
+  // The end of this word is accepted even for HINT_BEGIN. That only happens
+  // when no word starts at this point; the original author was not fully
+  // sure this is OK.
+  const RealWord& candidate = mRealWords[lastIndex];
+  const int32_t delta = aSoftTextOffset - candidate.mSoftTextOffset;
+  if (delta >= 0 && delta <= static_cast<int32_t>(candidate.mLength)) {
+    return static_cast<int32_t>(lastIndex);
+  }
+
+  if (!aSearchForward) {
+    return -1;
+  }
+
+  if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
+    // Every word starts after aSoftTextOffset, so the first one is the next.
+    return 0;
+  }
+
+  // |lastIndex| is the last word starting at or before the offset, so the
+  // following word, if it exists, is the first one starting after it.
+  const size_t nextIndex = lastIndex + 1;
+  return nextIndex < mRealWords.Length() ? static_cast<int32_t>(nextIndex)
+                                         : -1;
+}
+
+// mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
+
+// Splits the DOM word occupying soft-text range [aStart, aEnd) into real
+// words and appends them to aRealWords. Offsets stored in the appended
+// RealWord entries are relative to the soft text (aStart is added to the
+// offset within the DOM word).
+//
+// Returns NS_OK on success, or NS_ERROR_OUT_OF_MEMORY if a fallible append to
+// aRealWords fails. Words appended before the failure are left in place.
+nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
+ int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const {
+ // View onto the DOM word inside the soft text.
+ nsDependentSubstring targetText(mSoftText.GetValue(), aStart, aEnd - aStart);
+ WordSplitState<nsDependentSubstring> state(targetText);
+ // Seed the splitter with the class of the first character.
+ state.mCurCharClass = state.ClassifyCharacter(0, true);
+
+ state.AdvanceThroughSeparators();
+ // If what follows the leading separators is a "special word" (as decided by
+ // WordSplitState::IsSpecialWord), record everything from the current offset
+ // to the end of the DOM word as a single RealWord and stop.
+ // NOTE(review): the third RealWord argument is presumably the "should be
+ // spell-checked" flag, passed as false here -- confirm against RealWord.
+ if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && state.IsSpecialWord()) {
+ int32_t specialWordLength =
+ state.mDOMWordText.Length() - state.mDOMWordOffset;
+ if (!aRealWords.AppendElement(
+ RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
+ fallible)) {
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+
+ return NS_OK;
+ }
+
+ // Otherwise alternate between skipping separators and consuming a word until
+ // the end of the DOM word is reached.
+ while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
+ state.AdvanceThroughSeparators();
+ if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) break;
+
+ // save the beginning of the word
+ int32_t wordOffset = state.mDOMWordOffset;
+
+ // find the end of the word
+ state.AdvanceThroughWord();
+ int32_t wordLen = state.mDOMWordOffset - wordOffset;
+ // The third argument is false when ShouldSkipWord() says to skip the word.
+ if (!aRealWords.AppendElement(
+ RealWord(aStart + wordOffset, wordLen,
+ !state.ShouldSkipWord(wordOffset, wordLen)),
+ fallible)) {
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+ }
+
+ return NS_OK;
+}