summaryrefslogtreecommitdiffstats
path: root/accessible/atk/DOMtoATK.h
blob: 322358bc6e7b13cfbf24c3f3c912bd4d2162695d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <glib.h>
#include <cstdint>
#include "mozilla/a11y/HyperTextAccessibleBase.h"
#include "nsCharTraits.h"
#include "nsString.h"

/**
 * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
 * in UTF-16 code units.  That makes a difference for non-BMP characters,
 * which need two UTF-16 code units to be represented (a pair of surrogates),
 * while they are just one unicode character.
 *
 * To keep synchronization between ATK offsets (unicode codepoints) and DOM
 * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
 * BOM after each non-BMP character (which would otherwise use 2 UTF-16
 * code units for only 1 unicode codepoint).
 *
 * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
 * that usage is deprecated) normally only appear at the beginning of unicode
 * files, but their occurrence within text (notably after cut&paste) is not
 * uncommon, and they are thus considered as non-text.
 *
 * Since the selection requested through ATK may not contain both surrogates
 * at the ends of the selection, we need to fetch one more UTF-16 code unit
 * on both sides, and get rid of it before returning the string to ATK. The
 * ATKStringConverterHelper class maintains this; NewATKString should be used
 * to call it properly.
 *
 * In the end,
 * - if the start is between the high and low surrogates, the UTF-8 result
 * includes a BOM from it but not the character
 * - if the end is between the high and low surrogates, the UTF-8 result
 * includes the character but *not* the BOM
 * - all non-BMP characters that are fully in the string are in the UTF-8 result
 * as character followed by BOM
 */
namespace mozilla {
namespace a11y {

namespace DOMtoATK {

/**
 * Converts a string of accessible text into ATK gchar* string (by adding
 * BOMs). This can be used when offsets do not need to be adjusted because
 * ends of the string can not fall between surrogates.
 *
 * @param aStr the accessible text to convert.
 * @return the converted text as a gchar* string.
 *         NOTE(review): presumably a newly allocated buffer that the caller
 *         has to release with g_free() -- confirm against the definition.
 */
gchar* Convert(const nsAString& aStr);

/**
 * Add a BOM after each non-BMP character.
 *
 * @param aDest destination string; presumably receives the content of
 *        aSource with the BOMs inserted -- TODO confirm whether it is
 *        overwritten or appended to.
 * @param aSource the text to process; taken by const reference.
 *        NOTE(review): presumably UTF-8 encoded, since both strings are
 *        narrow (nsACString) -- confirm against the definition.
 */
void AddBOMs(nsACString& aDest, const nsACString& aSource);

class ATKStringConverterHelper {
 public:
  // Every flag starts out false; see the member initializers at the bottom.
  ATKStringConverterHelper() = default;

  /**
   * Widens the requested range by one character at each end, so that
   * ConvertUTF16toUTF8 is able to convert surrogates even when the offsets
   * that were originally requested fall between the two halves of a pair.
   *
   * @param aStartOffset start offset, passed by pointer so that it can be
   *        adjusted.
   * @param aEndOffset end offset, passed by pointer so that it can be
   *        adjusted.
   * @param count NOTE(review): presumably the number of characters of the
   *        text being accessed -- confirm against the definition.
   */
  void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);

  /**
   * Turns accessible text that was fetched with adjusted offsets into an ATK
   * gchar* string (by adding BOMs).  AdjustOffsets must have been called
   * before fetching the text that is handed to this method.
   */
  gchar* ConvertAdjusted(const nsAString& aStr);

 private:
  /**
   * Strips the additional characters that were requested through the offset
   * adjustment (see AdjustOffsets).
   */
  gchar* FinishUTF16toUTF8(nsCString& aStr);

#ifdef DEBUG
  // Debug-only flag; presumably records that AdjustOffsets has been called.
  bool mAdjusted = false;
#endif
  // Presumably record whether the start / end offset was moved by
  // AdjustOffsets -- confirm against the definition.
  bool mStartShifted = false;
  bool mEndShifted = false;
};

/**
 * Fetches the text of aAccessible between the given offsets and returns it
 * as an ATK string, going through ATKStringConverterHelper so that the
 * appropriate BOMs are introduced.  An empty result from the accessible
 * yields a freshly duplicated empty string instead of a conversion.
 */
inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible,
                           gint aStartOffset, gint aEndOffset) {
  // AdjustOffsets takes the offsets by pointer, so hand it local copies.
  gint adjustedStart = aStartOffset;
  gint adjustedEnd = aEndOffset;

  ATKStringConverterHelper helper;
  helper.AdjustOffsets(&adjustedStart, &adjustedEnd,
                       static_cast<gint>(aAccessible->CharacterCount()));

  nsAutoString text;
  aAccessible->TextSubstring(adjustedStart, adjustedEnd, text);

  if (text.Length() != 0) {
    return helper.ConvertAdjusted(text);
  }

  // Bogus offsets, or empty string, either way we do not need conversion.
  return g_strdup("");
}

/**
 * Returns the character ATK should see at aOffset in aAccessible, reading one
 * more UTF-16 code unit when that is needed:
 * - a trailing (low) surrogate yields a BOM (U+FEFF);
 * - a heading (high) surrogate is combined with the code unit following it,
 *   or yields U+FFFD when that code unit is not a trailing surrogate;
 * - any other code unit is returned as is.
 */
inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible,
                             gint aOffset) {
  const gunichar kBOM = 0xFEFF;
  const gunichar kErrorCharacter = 0xFFFD;

  // char16_t is unsigned short in Mozilla, gunichar is guint32 in glib.
  const gunichar unit = static_cast<gunichar>(aAccessible->CharAt(aOffset));

  if (NS_IS_HIGH_SURROGATE(unit)) {
    // Heading surrogate, fetch the code unit that should complete the pair.
    const gunichar next =
        static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));

    if (NS_IS_LOW_SURROGATE(next)) {
      return SURROGATE_TO_UCS4(unit, next);
    }

    // It should have been a trailing surrogate... Flag the error.
    return kErrorCharacter;
  }

  if (NS_IS_LOW_SURROGATE(unit)) {
    // Trailing surrogate, return BOM instead.
    return kBOM;
  }

  return unit;
}

}  // namespace DOMtoATK

}  // namespace a11y
}  // namespace mozilla