diff options
Diffstat (limited to '')
-rw-r--r-- | accessible/atk/DOMtoATK.h | 152 |
1 files changed, 152 insertions, 0 deletions
diff --git a/accessible/atk/DOMtoATK.h b/accessible/atk/DOMtoATK.h new file mode 100644 index 0000000000..322358bc6e --- /dev/null +++ b/accessible/atk/DOMtoATK.h @@ -0,0 +1,152 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <glib.h> +#include <cstdint> +#include "mozilla/a11y/HyperTextAccessibleBase.h" +#include "nsCharTraits.h" +#include "nsString.h" + +/** + * ATK offsets are counted in unicode codepoints, while DOM offsets are counted + * in UTF-16 code units. That makes a difference for non-BMP characters, + * which need two UTF-16 code units to be represented (a pair of surrogates), + * while they are just one unicode character. + * + * To keep synchronization between ATK offsets (unicode codepoints) and DOM + * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a + * BOM after each non-BMP character (which would otherwise use 2 UTF-16 + * code units for only 1 unicode codepoint). + * + * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but + * that usage is deprecated) normally only appear at the beginning of unicode + * files, but their occurrence within text (notably after cut&paste) is not + * uncommon, and are thus considered as non-text. + * + * Since the selection requested through ATK may not contain both surrogates + * at the ends of the selection, we need to fetch one UTF-16 code point more + * on both side, and get rid of it before returning the string to ATK. The + * ATKStringConverterHelper class maintains this, NewATKString should be used + * to call it properly. + * + * In the end, + * - if the start is between the high and low surrogates, the UTF-8 result + * includes a BOM from it but not the character + * - if the end is between the high and low surrogates, the UTF-8 result + * includes the character but *not* the BOM + * - all non-BMP characters that are fully in the string are in the UTF-8 result + * as character followed by BOM + */ +namespace mozilla { +namespace a11y { + +namespace DOMtoATK { + +/** + * Converts a string of accessible text into ATK gchar* string (by adding + * BOMs). This can be used when offsets do not need to be adjusted because + * ends of the string can not fall between surrogates. + */ +gchar* Convert(const nsAString& aStr); + +/** + * Add a BOM after each non-BMP character. + */ +void AddBOMs(nsACString& aDest, const nsACString& aSource); + +class ATKStringConverterHelper { + public: + ATKStringConverterHelper(void) + : +#ifdef DEBUG + mAdjusted(false), +#endif + mStartShifted(false), + mEndShifted(false) { + } + + /** + * In order to properly get non-BMP values, offsets need to be changed + * to get one character more on each end, so that ConvertUTF16toUTF8 can + * convert surrogates even if the originally requested offsets fall between + * them. + */ + void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count); + + /** + * Converts a string of accessible text with adjusted offsets into ATK + * gchar* string (by adding BOMs). Note, AdjustOffsets has to be called + * before getting the text passed to this. + */ + gchar* ConvertAdjusted(const nsAString& aStr); + + private: + /** + * Remove the additional characters requested by PrepareUTF16toUTF8. + */ + gchar* FinishUTF16toUTF8(nsCString& aStr); + +#ifdef DEBUG + bool mAdjusted; +#endif + bool mStartShifted; + bool mEndShifted; +}; + +/** + * Get text from aAccessible, using ATKStringConverterHelper to properly + * introduce appropriate BOMs. + */ +inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible, + gint aStartOffset, gint aEndOffset) { + gint startOffset = aStartOffset, endOffset = aEndOffset; + ATKStringConverterHelper converter; + converter.AdjustOffsets(&startOffset, &endOffset, + gint(aAccessible->CharacterCount())); + nsAutoString str; + aAccessible->TextSubstring(startOffset, endOffset, str); + + if (str.Length() == 0) { + // Bogus offsets, or empty string, either way we do not need conversion. + return g_strdup(""); + } + + return converter.ConvertAdjusted(str); +} + +/** + * Get a character from aAccessible, fetching more data as appropriate to + * properly get non-BMP characters or a BOM as appropriate. + */ +inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible, + gint aOffset) { + // char16_t is unsigned short in Mozilla, gnuichar is guint32 in glib. + gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset)); + + if (NS_IS_LOW_SURROGATE(character)) { + // Trailing surrogate, return BOM instead. + return 0xFEFF; + } + + if (NS_IS_HIGH_SURROGATE(character)) { + // Heading surrogate, get the trailing surrogate and combine them. + gunichar characterLow = + static_cast<gunichar>(aAccessible->CharAt(aOffset + 1)); + + if (!NS_IS_LOW_SURROGATE(characterLow)) { + // It should have been a trailing surrogate... Flag the error. + return 0xFFFD; + } + return SURROGATE_TO_UCS4(character, characterLow); + } + + return character; +} + +} // namespace DOMtoATK + +} // namespace a11y +} // namespace mozilla |