summaryrefslogtreecommitdiffstats
path: root/accessible/atk/DOMtoATK.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /accessible/atk/DOMtoATK.h
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'accessible/atk/DOMtoATK.h')
-rw-r--r--accessible/atk/DOMtoATK.h152
1 files changed, 152 insertions, 0 deletions
diff --git a/accessible/atk/DOMtoATK.h b/accessible/atk/DOMtoATK.h
new file mode 100644
index 0000000000..322358bc6e
--- /dev/null
+++ b/accessible/atk/DOMtoATK.h
@@ -0,0 +1,152 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <glib.h>
+#include <cstdint>
+#include "mozilla/a11y/HyperTextAccessibleBase.h"
+#include "nsCharTraits.h"
+#include "nsString.h"
+
+/**
+ * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
+ * in UTF-16 code units. That makes a difference for non-BMP characters,
+ * which need two UTF-16 code units to be represented (a pair of surrogates),
+ * while they are just one unicode character.
+ *
+ * To keep synchronization between ATK offsets (unicode codepoints) and DOM
+ * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
+ * BOM after each non-BMP character (which would otherwise use 2 UTF-16
+ * code units for only 1 unicode codepoint).
+ *
+ * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
+ * that usage is deprecated) normally only appear at the beginning of Unicode
+ * files, but their occurrence within text (notably after cut&paste) is not
+ * uncommon, and they are thus treated as non-text.
+ *
+ * Since the selection requested through ATK may not contain both surrogates
+ * at the ends of the selection, we need to fetch one more UTF-16 code unit
+ * on each side, and get rid of them before returning the string to ATK. The
+ * ATKStringConverterHelper class takes care of this; NewATKString should be
+ * used to call it properly.
+ *
+ * In the end,
+ * - if the start is between the high and low surrogates, the UTF-8 result
+ * includes a BOM from it but not the character
+ * - if the end is between the high and low surrogates, the UTF-8 result
+ * includes the character but *not* the BOM
+ * - all non-BMP characters that are fully in the string are in the UTF-8 result
+ * as character followed by BOM
+ */
+namespace mozilla {
+namespace a11y {
+
+namespace DOMtoATK {
+
+/**
+ * Converts a string of accessible text (aStr) into an ATK gchar* string, by
+ * adding BOMs. Use this when the offsets do not need to be adjusted, i.e.
+ * when neither end of the string can fall between the two surrogates of a
+ * pair; otherwise go through ATKStringConverterHelper / NewATKString.
+ *
+ * NOTE(review): ownership of the returned string is not visible from this
+ * declaration; presumably the caller releases it with g_free -- confirm
+ * against the definition.
+ */
+gchar* Convert(const nsAString& aStr);
+
+/**
+ * Adds a BOM (U+FEFF) after each non-BMP character, so that ATK offsets
+ * (counted in codepoints) stay in sync with DOM offsets (counted in UTF-16
+ * code units, two per non-BMP character).
+ *
+ * The text is read from aSource and the result goes to aDest.
+ * NOTE(review): presumably aSource is already UTF-8 and aDest is overwritten
+ * rather than appended to -- confirm against the definition.
+ */
+void AddBOMs(nsACString& aDest, const nsACString& aSource);
+
+class ATKStringConverterHelper {
+ public:
+ ATKStringConverterHelper(void)
+ :
+#ifdef DEBUG
+ mAdjusted(false),
+#endif
+ mStartShifted(false),
+ mEndShifted(false) {
+ }
+
+ /**
+ * In order to properly get non-BMP values, offsets need to be changed
+ * to get one character more on each end, so that ConvertUTF16toUTF8 can
+ * convert surrogates even if the originally requested offsets fall between
+ * them.
+ *
+ * aStartOffset and aEndOffset are passed by pointer: NewATKString hands in
+ * copies of the requested offsets and then fetches the text with the values
+ * left in them after this call. For count it passes the character count of
+ * the accessible. NOTE(review): presumably count bounds the widened end
+ * offset -- confirm against the definition.
+ */
+ void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);
+
+ /**
+ * Converts a string of accessible text with adjusted offsets into an ATK
+ * gchar* string (by adding BOMs). Note that AdjustOffsets has to be called
+ * first, and aStr has to be the text fetched with the offsets it produced.
+ */
+ gchar* ConvertAdjusted(const nsAString& aStr);
+
+ private:
+ /**
+ * Remove the additional characters requested by AdjustOffsets.
+ * NOTE(review): presumably driven by mStartShifted / mEndShifted -- confirm
+ * against the definition.
+ */
+ gchar* FinishUTF16toUTF8(nsCString& aStr);
+
+#ifdef DEBUG
+ bool mAdjusted;
+#endif
+ bool mStartShifted;
+ bool mEndShifted;
+};
+
+/**
+ * Gets the text of aAccessible between aStartOffset and aEndOffset as an ATK
+ * string, using ATKStringConverterHelper to properly introduce appropriate
+ * BOMs.
+ *
+ * The requested offsets are copied and handed to AdjustOffsets together with
+ * the character count of the accessible; the text is then fetched with the
+ * adjusted offsets and converted with ConvertAdjusted.
+ *
+ * aAccessible is dereferenced without a null check, so it must not be null.
+ *
+ * Returns a g_strdup'ed empty string when no text was fetched, and the
+ * result of ConvertAdjusted otherwise. NOTE(review): presumably the caller
+ * releases the result with g_free on both paths -- confirm for
+ * ConvertAdjusted.
+ */
+inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible,
+ gint aStartOffset, gint aEndOffset) {
+ gint startOffset = aStartOffset, endOffset = aEndOffset;
+ ATKStringConverterHelper converter;
+ converter.AdjustOffsets(&startOffset, &endOffset,
+ gint(aAccessible->CharacterCount()));
+ nsAutoString str;
+ aAccessible->TextSubstring(startOffset, endOffset, str);
+
+ if (str.Length() == 0) {
+ // Bogus offsets or an empty string: either way there is nothing to convert.
+ return g_strdup("");
+ }
+
+ return converter.ConvertAdjusted(str);
+}
+
+/**
+ * Gets the character at aOffset in aAccessible, fetching more data as
+ * appropriate to properly get non-BMP characters or a BOM as appropriate:
+ * - if the code unit at aOffset is a trailing (low) surrogate, a BOM (U+FEFF)
+ *   is returned instead;
+ * - if it is a leading (high) surrogate, the code unit at aOffset + 1 is
+ *   fetched as well and the combined non-BMP character is returned, or U+FFFD
+ *   when that second code unit is not a trailing surrogate;
+ * - otherwise the code unit is returned as is.
+ *
+ * aAccessible is dereferenced without a null check, so it must not be null.
+ */
+inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible,
+ gint aOffset) {
+ // char16_t is unsigned short in Mozilla, gunichar is guint32 in glib.
+ gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset));
+
+ if (NS_IS_LOW_SURROGATE(character)) {
+ // Trailing surrogate, return BOM instead.
+ return 0xFEFF;
+ }
+
+ if (NS_IS_HIGH_SURROGATE(character)) {
+ // Leading surrogate, get the trailing surrogate and combine them.
+ gunichar characterLow =
+ static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));
+
+ if (!NS_IS_LOW_SURROGATE(characterLow)) {
+ // It should have been a trailing surrogate... Flag the error.
+ return 0xFFFD;
+ }
+ return SURROGATE_TO_UCS4(character, characterLow);
+ }
+
+ return character;
+}
+
+} // namespace DOMtoATK
+
+} // namespace a11y
+} // namespace mozilla