summaryrefslogtreecommitdiffstats
path: root/intl/components/src/Bidi.h
blob: 7b901e6bfdf4d051d5defbecd75079f12ec8a81c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef intl_components_Bidi_h_
#define intl_components_Bidi_h_

#include "mozilla/intl/BidiEmbeddingLevel.h"
#include "mozilla/intl/ICU4CGlue.h"

// The Rust unicode-bidi implementation is only enabled on Nightly builds while
// perf regressions are being investigated; Beta/Release builds keep using the
// ICU4C implementation for now.
#if defined(NIGHTLY_BUILD)
#  define USE_RUST_UNICODE_BIDI 1
#else
#  define USE_RUST_UNICODE_BIDI 0
#endif

// Bring the selected backend's handle type into scope: a forward declaration
// of UBiDi for the ICU4C path, or the generated FFI header for the Rust path.
#if !USE_RUST_UNICODE_BIDI
struct UBiDi;
#else
#  include "mozilla/intl/unicode_bidi_ffi_generated.h"
#endif

namespace mozilla::intl {

/**
 * A Mozilla-focused API for working with bidirectional (bidi) text. Text is
 * commonly displayed left to right (LTR), especially for Latin-based
 * alphabets, whereas languages like Arabic and Hebrew display text right to
 * left (RTL). LTR and RTL text can be combined within the same paragraph, and
 * this class provides tools for working with both unidirectional and mixed
 * direction paragraphs.
 *
 * Implementation details are described by the Unicode Bidirectional Algorithm:
 * https://unicode.org/reports/tr9/
 */
class Bidi final {
 public:
  /**
   * The text direction of the paragraph that was set. A unidirectional
   * paragraph has a single direction (LTR or RTL); a paragraph that uses both
   * LTR and RTL has the direction Mixed.
   */
  enum class ParagraphDirection { LTR, RTL, Mixed };

  /**
   * Classifies the bidi character type of the first strong character; this is
   * the result type of `Bidi::GetBaseDirection`.
   *   LTR:     bidi character type 'L'.
   *   RTL:     bidi character type 'R' or 'AL'.
   *   Neutral: any other bidi character type.
   */
  enum class BaseDirection { LTR, RTL, Neutral };

  Bidi();
  ~Bidi();

  // Copy operations are deleted; a Bidi can be neither copied nor moved.
  Bidi(const Bidi&) = delete;
  Bidi& operator=(const Bidi&) = delete;

  /**
   * Provide the paragraph of text whose bidi properties should be analyzed.
   * The Unicode bidi algorithm is run on it, as specified by:
   * https://unicode.org/reports/tr9/
   *
   * Once the text has been set, the getter methods of this class report the
   * directionality of the paragraph text.
   */
  ICUResult SetParagraph(Span<const char16_t> aParagraph,
                         BidiEmbeddingLevel aLevel);

  /**
   * Returns the embedding level of the paragraph given to SetParagraph.
   */
  BidiEmbeddingLevel GetParagraphEmbeddingLevel() const;

  /**
   * Returns the directionality of the paragraph text given to SetParagraph.
   */
  ParagraphDirection GetParagraphDirection() const;

  /**
   * Returns the number of runs. SetParagraph may have resolved only the levels
   * of the text, in which case this call may perform the actual reordering on
   * the Bidi object. `CountRuns` may therefore need to allocate memory, and
   * that allocation may fail.
   */
  Result<int32_t, ICUError> CountRuns();

  /**
   * Retrieves the next logical run, i.e. a run of text sharing the same
   * directionality and embedding level. Logical runs are reported in memory
   * order, not in display order.
   *
   * Important! `Bidi::CountRuns` must be called before calling this method.
   *
   * @param aLogicalStart is the offset into the paragraph text that marks the
   *      logical start of the text.
   * @param aLogicalLimitOut is an out param that is the length of the string
   *      that makes up the logical run.
   *      NOTE(review): the parameter name suggests an exclusive end offset
   *      (limit) rather than a length -- confirm against the implementation.
   * @param aLevelOut is an out param that receives the embedding level of the
   *      run.
   */
  void GetLogicalRun(int32_t aLogicalStart, int32_t* aLogicalLimitOut,
                     BidiEmbeddingLevel* aLevelOut);

  /**
   * Convenience function that does not use the ICU Bidi object. It is meant
   * for applications that have already determined the embedding levels of
   * their objects (character sequences) and only need them reordered (L2).
   *
   * @param aLevels is an array of `aLength` levels that have been determined
   *      by the application.
   *
   * @param aLength is the number of levels in the array or, semantically, the
   *      number of objects to be reordered. It must be greater than 0.
   *
   * @param aIndexMap points to an array of `aLength` indexes that will
   *      reflect the reordering of the characters. The array does not need to
   *      be initialized. The resulting index map satisfies
   *        `aIndexMap[aVisualIndex]==aLogicalIndex`.
   */
  static void ReorderVisual(const BidiEmbeddingLevel* aLevels, int32_t aLength,
                            int32_t* aIndexMap);

  /**
   * Returns the base direction of the text.
   */
  static BaseDirection GetBaseDirection(Span<const char16_t> aText);

  /**
   * Retrieves the logical start, length, and directionality of a single run.
   * For an RTL run, the character at the logical start appears visually at
   * the right of the displayed run. The length counts the characters in the
   * run. `Bidi::CountRuns` should be called before the runs are retrieved.
   *
   * @param aRunIndex is the number of the run in visual order, in the
   *      range `[0..CountRuns-1]`.
   *
   * @param aLogicalStart receives the first logical character index in the
   *      text. The pointer may be `nullptr` if this index is not needed.
   *
   * @param aLength receives the number of characters (at least one) in the
   *      run. The pointer may be `nullptr` if this is not needed.
   *
   * @return the directionality of the run.
   *
   * Note that in right-to-left runs, the code places modifier letters before
   * base characters and second surrogates before first ones.
   */
  BidiDirection GetVisualRun(int32_t aRunIndex, int32_t* aLogicalStart,
                             int32_t* aLength);

 private:
#if USE_RUST_UNICODE_BIDI
  using UnicodeBidi = mozilla::intl::ffi::UnicodeBidi;

  // Deleter for `mBidi`: hands the object over to `bidi_destroy`.
  struct BidiFreePolicy {
    void operator()(void* aPtr) {
      auto* bidi = static_cast<UnicodeBidi*>(aPtr);
      bidi_destroy(bidi);
    }
  };

  // The Rust-side bidi object, destroyed through BidiFreePolicy.
  mozilla::UniquePtr<UnicodeBidi, BidiFreePolicy> mBidi;
#else
  // The ICU4C bidi object; starts out null.
  ICUPointer<UBiDi> mBidi = ICUPointer<UBiDi>(nullptr);

  /**
   * Array of levels with the same length as the paragraph passed to
   * `Bidi::SetParagraph`.
   */
  const BidiEmbeddingLevel* mLevels = nullptr;

  /**
   * Length of the paragraph passed to `Bidi::SetParagraph`.
   */
  int32_t mLength = 0;
#endif
};

}  // namespace mozilla::intl
#endif